From 87be28aaf1458445d5f648688c2eec0f13b8f3b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:58 +0200 Subject: x86/asm/tsc: Replace rdtscll() with native_read_tsc() Now that the ->read_tsc() paravirt hook is gone, rdtscll() is just a wrapper around native_read_tsc(). Unwrap it. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/d2449ae62c1b1fb90195bcfb19ef4a35883a04dc.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 39d6a3db0b96..9a52ad0c0758 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -100,7 +100,7 @@ void use_tsc_delay(void) int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscll(*timer_val); + *timer_val = native_read_tsc(); return 0; } return -1; -- cgit v1.2.3 From 9cfa1a0279e22063a727fd204a75cf3672860d83 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:00 +0200 Subject: x86/asm/tsc: Use the full 64-bit TSC in delay_tsc() As a very minor optimization, delay_tsc() was only using the low 32 bits of the TSC. It's a delay function, so just use the whole thing. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/bd1a277c71321b67c4794970cb5ace05efe21ab6.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 9a52ad0c0758..35115f3786a9 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -49,16 +49,16 @@ static void delay_loop(unsigned long loops) /* TSC based delay: */ static void delay_tsc(unsigned long __loops) { - u32 bclock, now, loops = __loops; + u64 bclock, now, loops = __loops; int cpu; preempt_disable(); cpu = smp_processor_id(); rdtsc_barrier(); - rdtscl(bclock); + bclock = native_read_tsc(); for (;;) { rdtsc_barrier(); - rdtscl(now); + now = native_read_tsc(); if ((now - bclock) >= loops) break; @@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops) loops -= (now - bclock); cpu = smp_processor_id(); rdtsc_barrier(); - rdtscl(bclock); + bclock = native_read_tsc(); } } preempt_enable(); -- cgit v1.2.3 From 4ea1636b04dbd66536fa387bae2eea463efc705b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:07 +0200 Subject: x86/asm/tsc: Rename native_read_tsc() to rdtsc() Now that there is no paravirt TSC, the "native" is inappropriate. The function does RDTSC, so give it the obvious name: rdtsc(). Suggested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/fd43e16281991f096c1e4d21574d9e1402c62d39.1434501121.git.luto@kernel.org [ Ported it to v4.2-rc1. 
] Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 35115f3786a9..f24bc59ab0a0 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,10 +55,10 @@ static void delay_tsc(unsigned long __loops) preempt_disable(); cpu = smp_processor_id(); rdtsc_barrier(); - bclock = native_read_tsc(); + bclock = rdtsc(); for (;;) { rdtsc_barrier(); - now = native_read_tsc(); + now = rdtsc(); if ((now - bclock) >= loops) break; @@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops) loops -= (now - bclock); cpu = smp_processor_id(); rdtsc_barrier(); - bclock = native_read_tsc(); + bclock = rdtsc(); } } preempt_enable(); @@ -100,7 +100,7 @@ void use_tsc_delay(void) int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - *timer_val = native_read_tsc(); + *timer_val = rdtsc(); return 0; } return -1; -- cgit v1.2.3 From 03b9730b769fc4d87e40f6104f4c5b2e43889f19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:08 +0200 Subject: x86/asm/tsc: Add rdtsc_ordered() and use it in trivial call sites rdtsc_barrier(); rdtsc() is an unnecessary mouthful and requires more thought than should be necessary. Add an rdtsc_ordered() helper and replace the trivial call sites with it. This should not change generated code. The duplication of the fence asm is temporary. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f24bc59ab0a0..4453d52a143d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -54,11 +54,9 @@ static void delay_tsc(unsigned long __loops) preempt_disable(); cpu = smp_processor_id(); - rdtsc_barrier(); - bclock = rdtsc(); + bclock = rdtsc_ordered(); for (;;) { - rdtsc_barrier(); - now = rdtsc(); + now = rdtsc_ordered(); if ((now - bclock) >= loops) break; @@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); - rdtsc_barrier(); - bclock = rdtsc(); + bclock = rdtsc_ordered(); } } preempt_enable(); -- cgit v1.2.3 From b466bdb614823aaaa7188e85516177d2850f4782 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 10 Aug 2015 12:19:54 +0200 Subject: x86/asm/delay: Introduce an MWAITX-based delay with a configurable timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MWAITX can enable a timer and a corresponding timer value specified in SW P0 clocks. The SW P0 frequency is the same as TSC. The timer provides an upper bound on how long the instruction waits before exiting. This way, a delay function in the kernel can leverage the MWAITX timer. When a CPU core executes MWAITX, it will be quiesced in a waiting phase, diminishing its power consumption. This way, we can save power in comparison to our default TSC-based delays. 
A simple test shows that: $ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc $ sleep 10000s $ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc Results: * TSC-based default delay: 485115 uWatts average power * MWAITX-based delay: 252738 uWatts average power Thus, that's about 240 milliWatts less power consumption. The test method relies on the support of AMD CPU accumulated power algorithm in fam15h_power for which patches are forthcoming. Suggested-by: Andy Lutomirski Suggested-by: Borislav Petkov Suggested-by: Peter Zijlstra Signed-off-by: Huang Rui [ Fix delay truncation. ] Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Andreas Herrmann Cc: Aravind Gopalakrishnan Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Li Link: http://lkml.kernel.org/r/1438744732-1459-3-git-send-email-ray.huang@amd.com Link: http://lkml.kernel.org/r/1439201994-28067-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/lib/delay.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 4453d52a143d..e912b2f6d36e 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_SMP # include @@ -83,6 +84,44 @@ static void delay_tsc(unsigned long __loops) preempt_enable(); } +/* + * On some AMD platforms, MWAITX has a configurable 32-bit timer, that + * counts with TSC frequency. The input value is the loop of the + * counter, it will exit when the timer expires. 
+ */ +static void delay_mwaitx(unsigned long __loops) +{ + u64 start, end, delay, loops = __loops; + + start = rdtsc_ordered(); + + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* + * Use cpu_tss as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ + __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf + * means, do not enter any deep C-state and we use it + * here in delay() to minimize wakeup latency. + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); + + end = rdtsc_ordered(); + + if (loops <= end - start) + break; + + loops -= end - start; + + start = end; + } +} + /* * Since we calibrate only once at boot, this * function should be set once at boot and not changed @@ -91,7 +130,13 @@ static void (*delay_fn)(unsigned long) = delay_loop; void use_tsc_delay(void) { - delay_fn = delay_tsc; + if (delay_fn == delay_loop) + delay_fn = delay_tsc; +} + +void use_mwaitx_delay(void) +{ + delay_fn = delay_mwaitx; } int read_current_timer(unsigned long *timer_val) -- cgit v1.2.3