diff options
29 files changed, 345 insertions, 414 deletions
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c6dec5fc20aa..047d3e40e470 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -172,14 +172,9 @@ static void default_idle(void) local_irq_enable(); } -void (*pm_idle)(void) = default_idle; -EXPORT_SYMBOL(pm_idle); - /* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. The only difference - * is that we always respect 'hlt_counter' to prevent low power idle. + * The idle thread. + * We always respect 'hlt_counter' to prevent low power idle. */ void cpu_idle(void) { @@ -210,10 +205,10 @@ void cpu_idle(void) } else if (!need_resched()) { stop_critical_timings(); if (cpuidle_idle_call()) - pm_idle(); + default_idle(); start_critical_timings(); /* - * pm_idle functions must always + * default_idle functions must always * return with IRQs enabled. */ WARN_ON(irqs_disabled()); diff --git a/arch/arm/mach-davinci/cpuidle.c b/arch/arm/mach-davinci/cpuidle.c index 9107691adbdb..5ac9e9384b15 100644 --- a/arch/arm/mach-davinci/cpuidle.c +++ b/arch/arm/mach-davinci/cpuidle.c @@ -25,35 +25,44 @@ #define DAVINCI_CPUIDLE_MAX_STATES 2 -struct davinci_ops { - void (*enter) (u32 flags); - void (*exit) (u32 flags); - u32 flags; -}; +static DEFINE_PER_CPU(struct cpuidle_device, davinci_cpuidle_device); +static void __iomem *ddr2_reg_base; +static bool ddr2_pdown; + +static void davinci_save_ddr_power(int enter, bool pdown) +{ + u32 val; + + val = __raw_readl(ddr2_reg_base + DDR2_SDRCR_OFFSET); + + if (enter) { + if (pdown) + val |= DDR2_SRPD_BIT; + else + val &= ~DDR2_SRPD_BIT; + val |= DDR2_LPMODEN_BIT; + } else { + val &= ~(DDR2_SRPD_BIT | DDR2_LPMODEN_BIT); + } + + __raw_writel(val, ddr2_reg_base + DDR2_SDRCR_OFFSET); +} /* Actual code that puts the SoC in different idle states */ static int davinci_enter_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - struct davinci_ops *ops = cpuidle_get_statedata(state_usage); - - if (ops && ops->enter) - ops->enter(ops->flags); + davinci_save_ddr_power(1, ddr2_pdown); index = cpuidle_wrap_enter(dev, drv, index, arm_cpuidle_simple_enter); - if (ops && ops->exit) - ops->exit(ops->flags); + davinci_save_ddr_power(0, ddr2_pdown); return index; } -/* fields in davinci_ops.flags */ -#define DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN BIT(0) - static struct cpuidle_driver davinci_idle_driver = { .name = "cpuidle-davinci", .owner = THIS_MODULE, @@ -70,45 +79,6 @@ static struct cpuidle_driver davinci_idle_driver = { .state_count = DAVINCI_CPUIDLE_MAX_STATES, }; -static DEFINE_PER_CPU(struct cpuidle_device, davinci_cpuidle_device); -static void __iomem *ddr2_reg_base; - -static void davinci_save_ddr_power(int enter, bool pdown) -{ - u32 val; - - val = __raw_readl(ddr2_reg_base + DDR2_SDRCR_OFFSET); - - if (enter) { - if (pdown) - val |= DDR2_SRPD_BIT; - else - val &= ~DDR2_SRPD_BIT; - val |= DDR2_LPMODEN_BIT; - } else { - val &= ~(DDR2_SRPD_BIT | DDR2_LPMODEN_BIT); - } - - __raw_writel(val, ddr2_reg_base + DDR2_SDRCR_OFFSET); -} - -static void davinci_c2state_enter(u32 flags) -{ - davinci_save_ddr_power(1, !!(flags & DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN)); -} - -static void davinci_c2state_exit(u32 flags) -{ - davinci_save_ddr_power(0, !!(flags & DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN)); -} - -static struct davinci_ops davinci_states[DAVINCI_CPUIDLE_MAX_STATES] = { - [1] = { - .enter = davinci_c2state_enter, - .exit = davinci_c2state_exit, - }, -}; - static int __init davinci_cpuidle_probe(struct platform_device *pdev) { int ret; @@ -124,11 +94,7 @@ static int __init davinci_cpuidle_probe(struct platform_device *pdev) ddr2_reg_base = pdata->ddr2_ctlr_base; - if (pdata->ddr2_pdown) - davinci_states[1].flags |= DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN; - cpuidle_set_statedata(&device->states_usage[1], &davinci_states[1]); - - device->state_count = DAVINCI_CPUIDLE_MAX_STATES; + ddr2_pdown = pdata->ddr2_pdown; ret = cpuidle_register_driver(&davinci_idle_driver); if (ret) { diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index cb0956bc96ed..c7002d40a9b0 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -97,14 +97,9 @@ static void default_idle(void) local_irq_enable(); } -void (*pm_idle)(void) = default_idle; -EXPORT_SYMBOL_GPL(pm_idle); - /* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. The only difference - * is that we always respect 'hlt_counter' to prevent low power idle. + * The idle thread. + * We always respect 'hlt_counter' to prevent low power idle. */ void cpu_idle(void) { @@ -122,10 +117,10 @@ void cpu_idle(void) local_irq_disable(); if (!need_resched()) { stop_critical_timings(); - pm_idle(); + default_idle(); start_critical_timings(); /* - * pm_idle functions should always return + * default_idle functions should always return * with IRQs enabled. */ WARN_ON(irqs_disabled()); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 3e16ad9b0a99..8061426b7df5 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -39,12 +39,6 @@ int nr_l1stack_tasks; void *l1_stack_base; unsigned long l1_stack_len; -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void) = NULL; -EXPORT_SYMBOL(pm_idle); - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); @@ -81,7 +75,6 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - void (*idle)(void) = pm_idle; #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(smp_processor_id())) diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 7f65be6f7f17..104ff4dd9b98 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -54,11 +54,6 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); -/* - * The following aren't currently used. - */ -void (*pm_idle)(void); - extern void default_idle(void); void (*pm_power_off)(void); @@ -77,16 +72,12 @@ void cpu_idle (void) while (1) { rcu_idle_enter(); while (!need_resched()) { - void (*idle)(void); /* * Mark this as an RCU critical section so that * synchronize_kernel() in the unload path waits * for our completion. */ - idle = pm_idle; - if (!idle) - idle = default_idle; - idle(); + default_idle(); } rcu_idle_exit(); schedule_preempt_disabled(); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 31360cbbd5f8..e34f565f595a 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -57,8 +57,6 @@ void (*ia64_mark_idle)(int); unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -void (*pm_idle) (void); -EXPORT_SYMBOL(pm_idle); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); @@ -301,7 +299,6 @@ cpu_idle (void) if (mark_idle) (*mark_idle)(1); - idle = pm_idle; if (!idle) idle = default_idle; (*idle)(); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index aaefd9b94f2f..2029cc0d2fc6 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1051,7 +1051,6 @@ cpu_init (void) max_num_phys_stacked = num_phys_stacked; } platform_cpu_init(); - pm_idle = default_idle; } void __init diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 765d0f57c787..bde899e155d3 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -44,36 +44,10 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return tsk->thread.lr; } -/* - * Powermanagement idle function, if any.. - */ -static void (*pm_idle)(void) = NULL; - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); /* - * We use this is we don't have any better - * idle routine.. - */ -static void default_idle(void) -{ - /* M32R_FIXME: Please use "cpu_sleep" mode. */ - cpu_relax(); -} - -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->work.need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle (void) -{ - /* M32R_FIXME */ - cpu_relax(); -} - -/* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for @@ -84,14 +58,8 @@ void cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { rcu_idle_enter(); - while (!need_resched()) { - void (*idle)(void) = pm_idle; - - if (!idle) - idle = default_idle; - - idle(); - } + while (!need_resched()) + cpu_relax(); rcu_idle_exit(); schedule_preempt_disabled(); } @@ -120,21 +88,6 @@ void machine_power_off(void) /* M32R_FIXME */ } -static int __init idle_setup (char *str) -{ - if (!strncmp(str, "poll", 4)) { - printk("using poll in idle threads.\n"); - pm_idle = poll_idle; - } else if (!strncmp(str, "sleep", 4)) { - printk("using sleep in idle threads.\n"); - pm_idle = default_idle; - } - - return 1; -} - -__setup("idle=", idle_setup); - void show_regs(struct pt_regs * regs) { printk("\n"); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a5b74f729e5b..6ff2dcff3410 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -41,7 +41,6 @@ void show_regs(struct pt_regs *regs) regs->msr, regs->ear, regs->esr, regs->fsr); } -void (*pm_idle)(void); void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); @@ -98,8 +97,6 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - void (*idle)(void) = pm_idle; - if (!idle) idle = default_idle; diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index eb09f5a552ff..84f4e97e3074 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -37,12 +37,6 @@ #include "internal.h" /* - * power management idle function, if any.. - */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); - -/* * return saved PC of a blocked thread. */ unsigned long thread_saved_pc(struct task_struct *tsk) @@ -113,7 +107,6 @@ void cpu_idle(void) void (*idle)(void); smp_rmb(); - idle = pm_idle; if (!idle) { #if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) idle = poll_idle; diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index 7d618feb1b72..5e8a3b6d6bc6 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -39,11 +39,6 @@ void (*powersave) (void) = NULL; -static inline void pm_idle(void) -{ - barrier(); -} - void cpu_idle(void) { set_thread_flag(TIF_POLLING_NRFLAG); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 0c910163caa3..3d5a1b387cc0 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -22,7 +22,7 @@ #include <asm/smp.h> #include <asm/bl_bit.h> -void (*pm_idle)(void); +static void (*sh_idle)(void); static int hlt_counter; @@ -103,9 +103,9 @@ void cpu_idle(void) /* Don't trace irqs off for idle */ stop_critical_timings(); if (cpuidle_idle_call()) - pm_idle(); + sh_idle(); /* - * Sanity check to ensure that pm_idle() returns + * Sanity check to ensure that sh_idle() returns * with IRQs enabled */ WARN_ON(irqs_disabled()); @@ -123,13 +123,13 @@ void __init select_idle_routine(void) /* * If a platform has set its own idle routine, leave it alone. */ - if (pm_idle) + if (sh_idle) return; if (hlt_works()) - pm_idle = default_idle; + sh_idle = default_idle; else - pm_idle = poll_idle; + sh_idle = poll_idle; } void stop_this_cpu(void *unused) diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index c1e01914fd98..2c7baa4c4505 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -118,6 +118,7 @@ extern unsigned long get_wchan(struct task_struct *); extern struct task_struct *last_task_used_math; #define cpu_relax() barrier() +extern void (*sparc_idle)(void); #endif diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index 348fa1aeabce..eefda32b595e 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -20,6 +20,7 @@ #include <asm/uaccess.h> #include <asm/auxio.h> #include <asm/apc.h> +#include <asm/processor.h> /* Debugging * @@ -158,7 +159,7 @@ static int apc_probe(struct platform_device *op) /* Assign power management IDLE handler */ if (!apc_no_idle) - pm_idle = apc_swift_idle; + sparc_idle = apc_swift_idle; printk(KERN_INFO "%s: power management initialized%s\n", APC_DEVNAME, apc_no_idle ? " (CPU idle disabled)" : ""); diff --git a/arch/sparc/kernel/leon_pmc.c b/arch/sparc/kernel/leon_pmc.c index 4e174321097d..708bca435219 100644 --- a/arch/sparc/kernel/leon_pmc.c +++ b/arch/sparc/kernel/leon_pmc.c @@ -9,6 +9,7 @@ #include <asm/leon_amba.h> #include <asm/cpu_type.h> #include <asm/leon.h> +#include <asm/processor.h> /* List of Systems that need fixup instructions around power-down instruction */ unsigned int pmc_leon_fixup_ids[] = { @@ -69,9 +70,9 @@ static int __init leon_pmc_install(void) if (sparc_cpu_model == sparc_leon) { /* Assign power management IDLE handler */ if (pmc_leon_need_fixup()) - pm_idle = pmc_leon_idle_fixup; + sparc_idle = pmc_leon_idle_fixup; else - pm_idle = pmc_leon_idle; + sparc_idle = pmc_leon_idle; printk(KERN_INFO "leon: power management initialized\n"); } diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index dcbb62f63068..8b7297faca79 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -17,6 +17,7 @@ #include <asm/oplib.h> #include <asm/uaccess.h> #include <asm/auxio.h> +#include <asm/processor.h> /* Debug * @@ -63,7 +64,7 @@ static int pmc_probe(struct platform_device *op) #ifndef PMC_NO_IDLE /* Assign power management IDLE handler */ - pm_idle = pmc_swift_idle; + sparc_idle = pmc_swift_idle; #endif printk(KERN_INFO "%s: power management initialized\n", PMC_DEVNAME); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index be8e862badaf..62eede13831a 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -43,8 +43,7 @@ * Power management idle function * Set in pm platform drivers (apc.c and pmc.c) */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); +void (*sparc_idle)(void); /* * Power-off handler instantiation for pm.h compliance @@ -75,8 +74,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ for (;;) { while (!need_resched()) { - if (pm_idle) - (*pm_idle)(); + if (sparc_idle) + (*sparc_idle)(); else cpu_relax(); } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 62bad9fed03e..872d7e22d847 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -45,11 +45,6 @@ static const char * const processor_modes[] = { "UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR" }; -/* - * The idle thread, has rather strange semantics for calling pm_idle, - * but this is what x86 does and we need to do the same, so that - * things like cpuidle get called in the same way. - */ void cpu_idle(void) { /* endless idle loop with no priority at all */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 225543bf45a5..1b635861401c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1912,6 +1912,7 @@ config APM_DO_ENABLE this feature. config APM_CPU_IDLE + depends on CPU_IDLE bool "Make CPU Idle calls when idle" ---help--- Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index bcdff997668c..2f366d0ac6b4 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -4,7 +4,8 @@ #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 -#define MWAIT_MAX_NUM_CSTATES 8 +#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) +#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 433a59fb1a74..8d013f5153bc 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -103,6 +103,8 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_IA32_POWER_CTL 0x000001fc + #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 @@ -272,6 +274,7 @@ #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_EBC_FREQUENCY_ID 0x0000002c +#define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index d65464e43503..9f4bc6a1164d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -232,6 +232,7 @@ #include <linux/acpi.h> #include <linux/syscore_ops.h> #include <linux/i8253.h> +#include <linux/cpuidle.h> #include <asm/uaccess.h> #include <asm/desc.h> @@ -360,13 +361,35 @@ struct apm_user { * idle percentage above which bios idle calls are done */ #ifdef CONFIG_APM_CPU_IDLE -#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012 #define DEFAULT_IDLE_THRESHOLD 95 #else #define DEFAULT_IDLE_THRESHOLD 100 #endif #define DEFAULT_IDLE_PERIOD (100 / 3) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .en_core_tk_irqen = 1, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + /* * Local variables */ @@ -377,7 +400,6 @@ static struct { static int clock_slowed; static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; static int suspends_pending; static int standbys_pending; static int ignore_sys_suspend; @@ -884,8 +906,6 @@ static void apm_do_busy(void) #define IDLE_CALC_LIMIT (HZ * 100) #define IDLE_LEAKY_MAX 16 -static void (*original_pm_idle)(void) __read_mostly; - /** * apm_cpu_idle - cpu idling for APM capable Linux * @@ -894,7 +914,8 @@ static void (*original_pm_idle)(void) __read_mostly; * Furthermore it calls the system default idle routine. */ -static void apm_cpu_idle(void) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ @@ -904,7 +925,6 @@ static void apm_cpu_idle(void) unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; - WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012"); recalc: if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; @@ -950,10 +970,7 @@ recalc: break; } } - if (original_pm_idle) - original_pm_idle(); - else - default_idle(); + default_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) @@ -963,7 +980,7 @@ recalc: if (apm_idle_done) apm_do_busy(); - local_irq_enable(); + return index; } /** @@ -2381,9 +2398,9 @@ static int __init apm_init(void) if (HZ != 100) idle_period = (idle_period * HZ) / 100; if (idle_threshold < 100) { - original_pm_idle = pm_idle; - pm_idle = apm_cpu_idle; - set_pm_idle = 1; + if (!cpuidle_register_driver(&apm_idle_driver)) + if (cpuidle_register_device(&apm_cpuidle_device)) + cpuidle_unregister_driver(&apm_idle_driver); } return 0; @@ -2393,15 +2410,9 @@ static void __exit apm_exit(void) { int error; - if (set_pm_idle) { - pm_idle = original_pm_idle; - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - kick_all_cpus_sync(); - } + cpuidle_unregister_device(&apm_cpuidle_device); + cpuidle_unregister_driver(&apm_idle_driver); + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { error = apm_engage_power_management(APM_DEVICE_ALL, 0); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index aef852eac292..b11719ea2f7b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -268,13 +268,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(pm_idle); -#endif +static void (*x86_idle)(void); #ifndef CONFIG_SMP static inline void play_dead(void) @@ -351,7 +345,7 @@ void cpu_idle(void) rcu_idle_enter(); if (cpuidle_idle_call()) - pm_idle(); + x86_idle(); rcu_idle_exit(); start_critical_timings(); @@ -399,9 +393,9 @@ EXPORT_SYMBOL(default_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) { - bool ret = !!pm_idle; + bool ret = !!x86_idle; - pm_idle = default_idle; + x86_idle = default_idle; return ret; } @@ -499,25 +493,24 @@ static void amd_e400_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - if (pm_idle == poll_idle && smp_num_siblings > 1) { + if (x86_idle == poll_idle && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); - } #endif - if (pm_idle) + if (x86_idle) return; if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); - pm_idle = amd_e400_idle; + x86_idle = amd_e400_idle; } else - pm_idle = default_idle; + x86_idle = default_idle; } void __init init_amd_e400_c1e_mask(void) { /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (pm_idle == amd_e400_idle) + if (x86_idle == amd_e400_idle) zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); } @@ -528,7 +521,7 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); - pm_idle = poll_idle; + x86_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "halt")) { /* @@ -538,7 +531,7 @@ static int __init idle_setup(char *str) * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ - pm_idle = default_idle; + x86_idle = default_idle; boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 52a5e3a4cdc3..fc95308e9a11 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -28,19 +28,12 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <linux/kernel.h> #include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/acpi.h> #include <linux/dmi.h> -#include <linux/moduleparam.h> -#include <linux/sched.h> /* need_resched() */ -#include <linux/pm_qos.h> +#include <linux/sched.h> /* need_resched() */ #include <linux/clockchips.h> #include <linux/cpuidle.h> -#include <linux/irqflags.h> /* * Include the apic definitions for x86 to have the APIC timer related defines @@ -52,22 +45,14 @@ #include <asm/apic.h> #endif -#include <asm/io.h> -#include <asm/uaccess.h> - #include <acpi/acpi_bus.h> #include <acpi/processor.h> -#include <asm/processor.h> #define PREFIX "ACPI: " #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); -#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) -#define C2_OVERHEAD 1 /* 1us */ -#define C3_OVERHEAD 1 /* 1us */ -#define PM_TIMER_TICKS_TO_US(p) (((p) * 1000)/(PM_TIMER_FREQUENCY/1000)) static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER; module_param(max_cstate, uint, 0000); @@ -81,6 +66,8 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); +static struct acpi_processor_cx *acpi_cstate[CPUIDLE_STATE_MAX]; + static int disabled_by_idle_boot_param(void) { return boot_option_idle_override == IDLE_POLL || @@ -735,8 +722,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor *pr; - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); + struct acpi_processor_cx *cx = acpi_cstate[index]; pr = __this_cpu_read(processors); @@ -759,8 +745,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, */ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) { - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); + struct acpi_processor_cx *cx = acpi_cstate[index]; ACPI_FLUSH_CPU_CACHE(); @@ -790,8 +775,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor *pr; - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); + struct acpi_processor_cx *cx = acpi_cstate[index]; pr = __this_cpu_read(processors); @@ -849,8 +833,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor *pr; - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); + struct acpi_processor_cx *cx = acpi_cstate[index]; pr = __this_cpu_read(processors); @@ -942,13 +925,13 @@ struct cpuidle_driver acpi_idle_driver = { * device i.e. per-cpu data * * @pr: the ACPI processor + * @dev : the cpuidle device */ -static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr) +static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, + struct cpuidle_device *dev) { int i, count = CPUIDLE_DRIVER_STATE_START; struct acpi_processor_cx *cx; - struct cpuidle_state_usage *state_usage; - struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id); if (!pr->flags.power_setup_done) return -EINVAL; @@ -967,7 +950,6 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr) for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) { cx = &pr->power.states[i]; - state_usage = &dev->states_usage[count]; if (!cx->valid) continue; @@ -978,8 +960,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr) !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) continue; #endif - - cpuidle_set_statedata(state_usage, cx); + acpi_cstate[count] = cx; count++; if (count == CPUIDLE_STATE_MAX) @@ -1103,7 +1084,7 @@ int acpi_processor_hotplug(struct acpi_processor *pr) cpuidle_disable_device(dev); acpi_processor_get_power_info(pr); if (pr->flags.power) { - acpi_processor_setup_cpuidle_cx(pr); + acpi_processor_setup_cpuidle_cx(pr, dev); ret = cpuidle_enable_device(dev); } cpuidle_resume_and_unlock(); @@ -1161,8 +1142,8 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr) continue; acpi_processor_get_power_info(_pr); if (_pr->flags.power) { - acpi_processor_setup_cpuidle_cx(_pr); dev = per_cpu(acpi_cpuidle_device, cpu); + acpi_processor_setup_cpuidle_cx(_pr, dev); cpuidle_enable_device(dev); } } @@ -1231,7 +1212,7 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr) return -ENOMEM; per_cpu(acpi_cpuidle_device, pr->id) = dev; - acpi_processor_setup_cpuidle_cx(pr); + acpi_processor_setup_cpuidle_cx(pr, dev); /* Register per-cpu cpuidle_device. Cpuidle driver * must already be registered before registering device diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 2df9414a72f7..5d6675013864 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -74,7 +74,7 @@ static struct cpuidle_driver intel_idle_driver = { .en_core_tk_irqen = 1, }; /* intel_idle.max_cstate=0 disables driver */ -static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1; +static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int mwait_substates; @@ -90,6 +90,7 @@ struct idle_cpu { * Indicate which enable bits to clear here. */ unsigned long auto_demotion_disable_flags; + bool disable_promotion_to_c1e; }; static const struct idle_cpu *icpu; @@ -109,162 +110,206 @@ static struct cpuidle_state *cpuidle_state_table; #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 /* + * MWAIT takes an 8-bit "hint" in EAX "suggesting" + * the C-state (top nibble) and sub-state (bottom nibble) + * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc. + * + * We store the hint at the top of our "flags" for each state. + */ +#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) +#define MWAIT2flg(eax) ((eax & 0xFF) << 24) + +/* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. */ -static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ +static struct cpuidle_state nehalem_cstates[CPUIDLE_STATE_MAX] = { + { .name = "C1-NHM", .desc = "MWAIT 0x00", - .flags = CPUIDLE_FLAG_TIME_VALID, + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 3, .target_residency = 6, .enter = &intel_idle }, - { /* MWAIT C2 */ + { + .name = "C1E-NHM", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, + { .name = "C3-NHM", .desc = "MWAIT 0x10", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 20, .target_residency = 80, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-NHM", .desc = "MWAIT 0x20", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 200, .target_residency = 800, .enter = &intel_idle }, + { + .enter = NULL } }; -static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ +static struct cpuidle_state snb_cstates[CPUIDLE_STATE_MAX] = { + { .name = "C1-SNB", .desc = "MWAIT 0x00", - .flags = CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 1, - .target_residency = 1, + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 2, + .target_residency = 2, .enter = &intel_idle }, - { /* MWAIT C2 */ + { + .name = "C1E-SNB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, + { .name = "C3-SNB", .desc = "MWAIT 0x10", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 211, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-SNB", .desc = "MWAIT 0x20", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 104, .target_residency = 345, .enter = &intel_idle }, - { /* MWAIT C4 */ + { .name = "C7-SNB", .desc = "MWAIT 0x30", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 109, .target_residency = 345, .enter = &intel_idle }, + { + .enter = NULL } }; -static struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ +static struct cpuidle_state ivb_cstates[CPUIDLE_STATE_MAX] = { + { .name = "C1-IVB", .desc = "MWAIT 0x00", - .flags = CPUIDLE_FLAG_TIME_VALID, + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 1, .target_residency = 1, .enter = &intel_idle }, - { /* MWAIT C2 */ + { + .name = "C1E-IVB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, + { .name = "C3-IVB", .desc = "MWAIT 0x10", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 59, .target_residency = 156, .enter = &intel_idle }, - { /* MWAIT C3 */ + { .name = "C6-IVB", .desc = "MWAIT 0x20", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 300, .enter = &intel_idle }, - { /* MWAIT C4 */ + { .name = "C7-IVB", .desc = "MWAIT 0x30", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 87, .target_residency = 300, .enter = &intel_idle }, + { + .enter = NULL } }; -static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { - { /* MWAIT C0 */ }, - { /* MWAIT C1 */ - .name = "C1-ATM", +static struct cpuidle_state hsw_cstates[CPUIDLE_STATE_MAX] = { + { + .name = "C1-HSW", .desc = "MWAIT 0x00", - .flags = CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 1, - .target_residency = 4, + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle }, + { + .name = "C1E-HSW", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle }, + { + .name = "C3-HSW", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, + .target_residency = 100, + .enter = &intel_idle }, + { + .name = "C6-HSW", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 400, + .enter = &intel_idle }, + { + .name = "C7s-HSW", + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, + .target_residency = 500, + .enter = &intel_idle }, + { + .enter = NULL } +}; + +static struct cpuidle_state atom_cstates[CPUIDLE_STATE_MAX] = { + { + .name = "C1E-ATM", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 20, .enter = &intel_idle }, - { /* MWAIT C2 */ + { .name = "C2-ATM", .desc = "MWAIT 0x10", - .flags = CPUIDLE_FLAG_TIME_VALID, + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID, .exit_latency = 20, .target_residency = 80, .enter = &intel_idle }, - { /* MWAIT C3 */ }, - { /* MWAIT C4 */ + { .name = "C4-ATM", .desc = "MWAIT 0x30", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 100, .target_residency = 400, .enter = &intel_idle }, - { /* MWAIT C5 */ }, - { /* MWAIT C6 */ + { .name = "C6-ATM", .desc = "MWAIT 0x52", - .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 140, .target_residency = 560, .enter = &intel_idle }, + { + .enter = NULL } }; -static long get_driver_data(int cstate) -{ - int driver_data; - switch (cstate) { - - case 1: /* MWAIT C1 */ - driver_data = 0x00; - break; - case 2: /* MWAIT C2 */ - driver_data = 0x10; - break; - case 3: /* MWAIT C3 */ - driver_data = 0x20; - break; - case 4: /* MWAIT C4 */ - driver_data = 0x30; - break; - case 5: /* MWAIT C5 */ - driver_data = 0x40; - break; - case 6: /* MWAIT C6 */ - driver_data = 0x52; - break; - default: - driver_data = 0x00; - } - return driver_data; -} - /** * intel_idle * @dev: cpuidle_device @@ -278,8 +323,7 @@ static int intel_idle(struct cpuidle_device *dev, { unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; - struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; - unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage); + unsigned long eax = flg2MWAIT(state->flags); unsigned int cstate; int cpu = smp_processor_id(); @@ -362,10 +406,19 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } +static void c1e_promotion_disable(void *dummy) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits &= ~0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} static const struct idle_cpu idle_cpu_nehalem = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, + .disable_promotion_to_c1e = true, }; static const struct idle_cpu idle_cpu_atom = { @@ -379,10 +432,17 @@ static const struct idle_cpu idle_cpu_lincroft = { static const struct idle_cpu idle_cpu_snb = { .state_table = snb_cstates, + .disable_promotion_to_c1e = true, }; static const struct idle_cpu idle_cpu_ivb = { .state_table = ivb_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_hsw = { + .state_table = hsw_cstates, + .disable_promotion_to_c1e = true, }; #define ICPU(model, cpu) \ @@ -402,6 +462,9 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x2d, idle_cpu_snb), ICPU(0x3a, idle_cpu_ivb), ICPU(0x3e, idle_cpu_ivb), + ICPU(0x3c, idle_cpu_hsw), + ICPU(0x3f, idle_cpu_hsw), + ICPU(0x45, idle_cpu_hsw), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); @@ -484,32 +547,31 @@ static int intel_idle_cpuidle_driver_init(void) drv->state_count = 1; - for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { - int num_substates; + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + int num_substates, mwait_hint, mwait_cstate, mwait_substate; - if (cstate > max_cstate) { + if (cpuidle_state_table[cstate].enter == NULL) + break; + + if (cstate + 1 > max_cstate) { printk(PREFIX "max_cstate %d reached\n", max_cstate); break; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint); + /* does the state exist in CPUID.MWAIT? */ - num_substates = (mwait_substates >> ((cstate) * 4)) + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) & MWAIT_SUBSTATE_MASK; - if (num_substates == 0) - continue; - /* is the state not enabled? */ - if (cpuidle_state_table[cstate].enter == NULL) { - /* does the driver not know about the state? */ - if (*cpuidle_state_table[cstate].name == '\0') - pr_debug(PREFIX "unaware of model 0x%x" - " MWAIT %d please" - " contact lenb@kernel.org\n", - boot_cpu_data.x86_model, cstate); + + /* if sub-state in table is not enumerated by CPUID */ + if ((mwait_substate + 1) > num_substates) continue; - } - if ((cstate > 2) && + if (((mwait_cstate + 1) > 2) && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halts in idle" " states deeper than C2"); @@ -523,6 +585,9 @@ static int intel_idle_cpuidle_driver_init(void) if (icpu->auto_demotion_disable_flags) on_each_cpu(auto_demotion_disable, NULL, 1); + if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ + on_each_cpu(c1e_promotion_disable, NULL, 1); + return 0; } @@ -541,25 +606,28 @@ static int intel_idle_cpu_init(int cpu) dev->state_count = 1; - for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) { - int num_substates; + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + int num_substates, mwait_hint, mwait_cstate, mwait_substate; + + if (cpuidle_state_table[cstate].enter == NULL) + continue; - if (cstate > max_cstate) { + if (cstate + 1 > max_cstate) { printk(PREFIX "max_cstate %d reached\n", max_cstate); break; } + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint); + /* does the state exist in CPUID.MWAIT? */ - num_substates = (mwait_substates >> ((cstate) * 4)) - & MWAIT_SUBSTATE_MASK; - if (num_substates == 0) - continue; - /* is the state not enabled? */ - if (cpuidle_state_table[cstate].enter == NULL) - continue; + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) + & MWAIT_SUBSTATE_MASK; - dev->states_usage[dev->state_count].driver_data = - (void *)get_driver_data(cstate); + /* if sub-state in table is not enumerated by CPUID */ + if ((mwait_substate + 1) > num_substates) + continue; dev->state_count += 1; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 24cd1037b6d6..480c14dc1ddd 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -32,8 +32,6 @@ struct cpuidle_driver; ****************************/ struct cpuidle_state_usage { - void *driver_data; - unsigned long long disable; unsigned long long usage; unsigned long long time; /* in US */ @@ -62,26 +60,6 @@ struct cpuidle_state { #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -/** - * cpuidle_get_statedata - retrieves private driver state data - * @st_usage: the state usage statistics - */ -static inline void *cpuidle_get_statedata(struct cpuidle_state_usage *st_usage) -{ - return st_usage->driver_data; -} - -/** - * cpuidle_set_statedata - stores private driver state data - * @st_usage: the state usage statistics - * @data: the private data - */ -static inline void -cpuidle_set_statedata(struct cpuidle_state_usage *st_usage, void *data) -{ - st_usage->driver_data = data; -} - struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; diff --git a/include/linux/pm.h b/include/linux/pm.h index 03d7bb145311..97bcf23e045a 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -31,7 +31,6 @@ /* * Callbacks for platform drivers to implement. */ -extern void (*pm_idle)(void); extern void (*pm_power_off)(void); extern void (*pm_power_off_prepare)(void); diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d7dc2cfefb5..b4ddb748356c 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -31,8 +31,6 @@ The \fB-S\fP option limits output to a 1-line System Summary for each interval. .PP The \fB-v\fP option increases verbosity. .PP -The \fB-s\fP option prints the SMI counter, equivalent to "-c 0x34" -.PP The \fB-c MSR#\fP option includes the delta of the specified 32-bit MSR counter. .PP The \fB-C MSR#\fP option includes the delta of the specified 64-bit MSR counter. @@ -186,26 +184,24 @@ This is a weighted average, where the weight is %c0. ie. it is the total number un-halted cycles elapsed per time divided by the number of CPUs. .SH SMI COUNTING EXAMPLE On Intel Nehalem and newer processors, MSR 0x34 is a System Management Mode Interrupt (SMI) counter. -Using the -m option, you can display how many SMIs have fired since reset, or if there -are SMIs during the measurement interval, you can display the delta using the -d option. +This counter is shown by default under the "SMI" column. .nf -[root@x980 ~]# turbostat -m 0x34 -cor CPU %c0 GHz TSC MSR 0x034 %c1 %c3 %c6 %pc3 %pc6 - 1.41 1.82 3.38 0x00000000 8.92 37.82 51.85 17.37 0.55 - 0 0 3.73 2.03 3.38 0x00000055 1.72 48.25 46.31 17.38 0.55 - 0 6 0.14 1.63 3.38 0x00000056 5.30 - 1 2 2.51 1.80 3.38 0x00000056 15.65 29.33 52.52 - 1 8 0.10 1.65 3.38 0x00000056 18.05 - 2 4 1.16 1.68 3.38 0x00000056 5.87 24.47 68.50 - 2 10 0.10 1.63 3.38 0x00000056 6.93 - 8 1 3.84 1.91 3.38 0x00000056 1.36 50.65 44.16 - 8 7 0.08 1.64 3.38 0x00000056 5.12 - 9 3 1.82 1.73 3.38 0x00000056 7.59 24.21 66.38 - 9 9 0.09 1.68 3.38 0x00000056 9.32 - 10 5 1.66 1.65 3.38 0x00000056 15.10 50.00 33.23 - 10 11 1.72 1.65 3.38 0x00000056 15.05 +[root@x980 ~]# turbostat +cor CPU %c0 GHz TSC SMI %c1 %c3 %c6 CTMP %pc3 %pc6 + 0.11 1.91 3.38 0 1.84 0.26 97.79 29 0.82 83.87 + 0 0 0.40 1.63 3.38 0 10.27 0.12 89.20 20 0.82 83.88 + 0 6 0.06 1.63 3.38 0 10.61 + 1 2 0.37 2.63 3.38 0 0.02 0.10 99.51 22 + 1 8 0.01 1.62 3.38 0 0.39 + 2 4 0.07 1.62 3.38 0 0.04 0.07 99.82 23 + 2 10 0.02 1.62 3.38 0 0.09 + 8 1 0.23 1.64 3.38 0 0.10 1.07 98.60 24 + 8 7 0.02 1.64 3.38 0 0.31 + 9 3 0.03 1.62 3.38 0 0.03 0.05 99.89 29 + 9 9 0.02 1.62 3.38 0 0.05 + 10 5 0.07 1.62 3.38 0 0.08 0.12 99.73 27 + 10 11 0.03 1.62 3.38 0 0.13 ^C -[root@x980 ~]# .fi .SH NOTES diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ce6d46038f74..6f3214ed4444 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -58,6 +58,7 @@ unsigned int extra_msr_offset32; unsigned int extra_msr_offset64; unsigned int extra_delta_offset32; unsigned int extra_delta_offset64; +int do_smi; double bclk; unsigned int show_pkg; unsigned int show_core; @@ -99,6 +100,7 @@ struct thread_data { unsigned long long extra_delta64; unsigned long long extra_msr32; unsigned long long extra_delta32; + unsigned int smi_count; unsigned int cpu_id; unsigned int flags; #define CPU_IS_FIRST_THREAD_IN_CORE 0x2 @@ -248,6 +250,8 @@ void print_header(void) if (has_aperf) outp += sprintf(outp, " GHz"); outp += sprintf(outp, " TSC"); + if (do_smi) + outp += sprintf(outp, " SMI"); if (extra_delta_offset32) outp += sprintf(outp, " count 0x%03X", extra_delta_offset32); if (extra_delta_offset64) @@ -314,6 +318,8 @@ int dump_counters(struct thread_data *t, struct core_data *c, extra_msr_offset32, t->extra_msr32); fprintf(stderr, "msr0x%x: %016llX\n", extra_msr_offset64, t->extra_msr64); + if (do_smi) + fprintf(stderr, "SMI: %08X\n", t->smi_count); } if (c) { @@ -352,6 +358,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, * RAM_W: %5.2 * GHz: "GHz" 3 columns %3.2 * TSC: "TSC" 3 columns %3.2 + * SMI: "SMI" 4 columns %4d * percentage " %pc3" %6.2 * Perf Status percentage: %5.2 * "CTMP" 4 columns %4d @@ -431,6 +438,10 @@ int format_counters(struct thread_data *t, struct core_data *c, /* TSC */ outp += sprintf(outp, "%5.2f", 1.0 * t->tsc/units/interval_float); + /* SMI */ + if (do_smi) + outp += sprintf(outp, "%4d", t->smi_count); + /* delta */ if (extra_delta_offset32) outp += sprintf(outp, " %11llu", t->extra_delta32); @@ -645,6 +656,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, */ old->extra_msr32 = new->extra_msr32; old->extra_msr64 = new->extra_msr64; + + if (do_smi) + old->smi_count = new->smi_count - old->smi_count; } int delta_cpu(struct thread_data *t, struct core_data *c, @@ -672,6 +686,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->mperf = 0; t->c1 = 0; + t->smi_count = 0; t->extra_delta32 = 0; t->extra_delta64 = 0; @@ -802,6 +817,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -4; } + if (do_smi) { + if (get_msr(cpu, MSR_SMI_COUNT, &msr)) + return -5; + t->smi_count = msr & 0xFFFFFFFF; + } if (extra_delta_offset32) { if (get_msr(cpu, extra_delta_offset32, &msr)) return -5; @@ -908,8 +928,7 @@ void print_verbose_header(void) get_msr(0, MSR_NHM_PLATFORM_INFO, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_NHM_PLATFORM_INFO: 0x%08llx\n", msr); ratio = (msr >> 40) & 0xFF; fprintf(stderr, "%d * %.0f = %.0f MHz max efficiency\n", @@ -919,13 +938,16 @@ void print_verbose_header(void) fprintf(stderr, "%d * %.0f = %.0f MHz TSC frequency\n", ratio, bclk, ratio * bclk); + get_msr(0, MSR_IA32_POWER_CTL, &msr); + fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E: %sabled)\n", + msr, msr & 0x2 ? "EN" : "DIS"); + if (!do_ivt_turbo_ratio_limit) goto print_nhm_turbo_ratio_limits; get_msr(0, MSR_IVT_TURBO_RATIO_LIMIT, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_IVT_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1016,8 +1038,7 @@ print_nhm_turbo_ratio_limits: get_msr(0, MSR_NHM_TURBO_RATIO_LIMIT, &msr); - if (verbose) - fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); + fprintf(stderr, "cpu0: MSR_NHM_TURBO_RATIO_LIMIT: 0x%08llx\n", msr); ratio = (msr >> 56) & 0xFF; if (ratio) @@ -1397,6 +1418,9 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) case 0x2D: /* SNB Xeon */ case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ return 1; case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ @@ -1488,6 +1512,9 @@ void rapl_probe(unsigned int family, unsigned int model) switch (model) { case 0x2A: case 0x3A: + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; break; case 0x2D: @@ -1724,6 +1751,9 @@ int is_snb(unsigned int family, unsigned int model) case 0x2D: case 0x3A: /* IVB */ case 0x3E: /* IVB Xeon */ + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ return 1; } return 0; @@ -1883,6 +1913,7 @@ void check_cpuid() do_nehalem_platform_info = genuine_intel && has_invariant_tsc; do_nhm_cstates = genuine_intel; /* all Intel w/ non-stop TSC have NHM counters */ + do_smi = do_nhm_cstates; do_snb_cstates = is_snb(family, model); bclk = discover_bclk(family, model); @@ -2219,9 +2250,6 @@ void cmdline(int argc, char **argv) case 'c': sscanf(optarg, "%x", &extra_delta_offset32); break; - case 's': - extra_delta_offset32 = 0x34; /* SMI counter */ - break; case 'C': sscanf(optarg, "%x", &extra_delta_offset64); break; @@ -2248,7 +2276,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.0 November 23, 2012" + fprintf(stderr, "turbostat v3.2 February 11, 2013" " - Len Brown <lenb@kernel.org>\n"); turbostat_init(); |