From d65817b4e707068c2dd3e002e87c2a0294aabc2c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 May 2015 11:37:59 -0700 Subject: perf bench futex: Support parallel waker threads The futex-wake benchmark only measures wakeups done within a single process. While this has value in its own, it does not really generate any hb->lock contention. A new benchmark 'wake-parallel' is added, by extending the futex-wake code such that we can measure parallel waker threads. The program output shows the avg per-thread latency in order to complete its share of wakeups: Run summary [PID 13474]: blocking on 512 threads (at [private] futex 0xa88668), 8 threads waking up 64 at a time. [Run 1]: Avg per-thread latency (waking 64/512 threads) in 0.6230 ms (+-15.31%) [Run 2]: Avg per-thread latency (waking 64/512 threads) in 0.5175 ms (+-29.95%) [Run 3]: Avg per-thread latency (waking 64/512 threads) in 0.7578 ms (+-18.03%) [Run 4]: Avg per-thread latency (waking 64/512 threads) in 0.8944 ms (+-12.54%) [Run 5]: Avg per-thread latency (waking 64/512 threads) in 1.1204 ms (+-23.85%) Avg per-thread latency (waking 64/512 threads) in 0.7826 ms (+-9.91%) Naturally, different combinations of numbers of blocking and waker threads will exhibit different information. Signed-off-by: Davidlohr Bueso Tested-by: Arnaldo Carvalho de Melo Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/1431110280-20231-1-git-send-email-dave@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-bench.txt | 3 + tools/perf/bench/Build | 1 + tools/perf/bench/bench.h | 2 + tools/perf/bench/futex-wake-parallel.c | 294 ++++++++++++++++++++++++++++++++ tools/perf/builtin-bench.c | 1 + 5 files changed, 301 insertions(+) create mode 100644 tools/perf/bench/futex-wake-parallel.c diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index f6480cbf309b..bf3d0644bf10 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -210,6 +210,9 @@ Suite for evaluating hash tables. *wake*:: Suite for evaluating wake calls. +*wake-parallel*:: +Suite for evaluating parallel wake calls. + *requeue*:: Suite for evaluating requeue calls. diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 5ce98023d518..c3ab760e06b4 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -3,6 +3,7 @@ perf-y += sched-pipe.o perf-y += mem-memcpy.o perf-y += futex-hash.o perf-y += futex-wake.o +perf-y += futex-wake-parallel.o perf-y += futex-requeue.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 3c4dd44d45cb..70b2f718cc21 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -33,6 +33,8 @@ extern int bench_mem_memcpy(int argc, const char **argv, extern int bench_mem_memset(int argc, const char **argv, const char *prefix); extern int bench_futex_hash(int argc, const char **argv, const char *prefix); extern int bench_futex_wake(int argc, const char **argv, const char *prefix); +extern int bench_futex_wake_parallel(int argc, const char **argv, + const char *prefix); extern int bench_futex_requeue(int argc, const char **argv, const char *prefix); #define BENCH_FORMAT_DEFAULT_STR "default" diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c new file mode 100644 index 000000000000..6d8c9fa2a16c --- /dev/null +++ b/tools/perf/bench/futex-wake-parallel.c @@ -0,0 +1,294 @@ +/* + * Copyright (C) 2015 Davidlohr Bueso. + * + * Block a bunch of threads and let parallel waker threads wakeup an + * equal amount of them. The program output reflects the avg latency + * for each individual thread to service its share of work. Ultimately + * it can be used to measure futex_wake() changes. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include +#include +#include +#include + +struct thread_data { + pthread_t worker; + unsigned int nwoken; + struct timeval runtime; +}; + +static unsigned int nwakes = 1; + +/* all threads will block on the same futex -- hash bucket chaos ;) */ +static u_int32_t futex = 0; + +static pthread_t *blocked_worker; +static bool done = false, silent = false, fshared = false; +static unsigned int nblocked_threads = 0, nwaking_threads = 0; +static pthread_mutex_t thread_lock; +static pthread_cond_t thread_parent, thread_worker; +static struct stats waketime_stats, wakeup_stats; +static unsigned int ncpus, threads_starting; +static int futex_flag = 0; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"), + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_END() +}; + +static const char * const bench_futex_wake_parallel_usage[] = { + "perf bench futex wake-parallel ", + NULL +}; + +static void *waking_workerfn(void *arg) +{ + struct thread_data *waker = (struct thread_data *) arg; + struct timeval start, end; + + gettimeofday(&start, NULL); + + waker->nwoken = futex_wake(&futex, nwakes, futex_flag); + if (waker->nwoken != nwakes) + warnx("couldn't wakeup all tasks (%d/%d)", + waker->nwoken, nwakes); + + gettimeofday(&end, NULL); + timersub(&end, &start, &waker->runtime); + + pthread_exit(NULL); + return NULL; +} + +static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) +{ + unsigned int i; + + pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); + + /* create and block all threads */ + for (i = 0; i < nwaking_threads; i++) { + /* + * Thread creation order will impact per-thread latency + * as it will affect the order to acquire the hb spinlock. + * For now let the scheduler decide. + */ + if (pthread_create(&td[i].worker, &thread_attr, + waking_workerfn, (void *)&td[i])) + err(EXIT_FAILURE, "pthread_create"); + } + + for (i = 0; i < nwaking_threads; i++) + if (pthread_join(td[i].worker, NULL)) + err(EXIT_FAILURE, "pthread_join"); +} + +static void *blocked_workerfn(void *arg __maybe_unused) +{ + pthread_mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + pthread_cond_signal(&thread_parent); + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + + while (1) { /* handle spurious wakeups */ + if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) + break; + } + + pthread_exit(NULL); + return NULL; +} + +static void block_threads(pthread_t *w, pthread_attr_t thread_attr) +{ + cpu_set_t cpu; + unsigned int i; + + threads_starting = nblocked_threads; + + /* create and block all threads */ + for (i = 0; i < nblocked_threads; i++) { + CPU_ZERO(&cpu); + CPU_SET(i % ncpus, &cpu); + + if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + + if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL)) + err(EXIT_FAILURE, "pthread_create"); + } +} + +static void print_run(struct thread_data *waking_worker, unsigned int run_num) +{ + unsigned int i, wakeup_avg; + double waketime_avg, waketime_stddev; + struct stats __waketime_stats, __wakeup_stats; + + init_stats(&__wakeup_stats); + init_stats(&__waketime_stats); + + for (i = 0; i < nwaking_threads; i++) { + update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec); + update_stats(&__wakeup_stats, waking_worker[i].nwoken); + } + + waketime_avg = avg_stats(&__waketime_stats); + waketime_stddev = stddev_stats(&__waketime_stats); + wakeup_avg = avg_stats(&__wakeup_stats); + + printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) " + "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg, + nblocked_threads, waketime_avg/1e3, + rel_stddev_stats(waketime_stddev, waketime_avg)); +} + +static void print_summary(void) +{ + unsigned int wakeup_avg; + double waketime_avg, waketime_stddev; + + waketime_avg = avg_stats(&waketime_stats); + waketime_stddev = stddev_stats(&waketime_stats); + wakeup_avg = avg_stats(&wakeup_stats); + + printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n", + wakeup_avg, + nblocked_threads, + waketime_avg/1e3, + rel_stddev_stats(waketime_stddev, waketime_avg)); +} + + +static void do_run_stats(struct thread_data *waking_worker) +{ + unsigned int i; + + for (i = 0; i < nwaking_threads; i++) { + update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec); + update_stats(&wakeup_stats, waking_worker[i].nwoken); + } + +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_wake_parallel(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + pthread_attr_t thread_attr; + struct thread_data *waking_worker; + + argc = parse_options(argc, argv, options, + bench_futex_wake_parallel_usage, 0); + if (argc) { + usage_with_options(bench_futex_wake_parallel_usage, options); + exit(EXIT_FAILURE); + } + + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + if (!nblocked_threads) + nblocked_threads = ncpus; + + /* some sanity checks */ + if (nwaking_threads > nblocked_threads || !nwaking_threads) + nwaking_threads = nblocked_threads; + + if (nblocked_threads % nwaking_threads) + errx(EXIT_FAILURE, "Must be perfectly divisible"); + /* + * Each thread will wakeup nwakes tasks in + * a single futex_wait call. + */ + nwakes = nblocked_threads/nwaking_threads; + + blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker)); + if (!blocked_worker) + err(EXIT_FAILURE, "calloc"); + + if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] " + "futex %p), %d threads waking up %d at a time.\n\n", + getpid(), nblocked_threads, fshared ? "shared":"private", + &futex, nwaking_threads, nwakes); + + init_stats(&wakeup_stats); + init_stats(&waketime_stats); + + pthread_attr_init(&thread_attr); + pthread_mutex_init(&thread_lock, NULL); + pthread_cond_init(&thread_parent, NULL); + pthread_cond_init(&thread_worker, NULL); + + for (j = 0; j < bench_repeat && !done; j++) { + waking_worker = calloc(nwaking_threads, sizeof(*waking_worker)); + if (!waking_worker) + err(EXIT_FAILURE, "calloc"); + + /* create, launch & block all threads */ + block_threads(blocked_worker, thread_attr); + + /* make sure all threads are already blocked */ + pthread_mutex_lock(&thread_lock); + while (threads_starting) + pthread_cond_wait(&thread_parent, &thread_lock); + pthread_cond_broadcast(&thread_worker); + pthread_mutex_unlock(&thread_lock); + + usleep(100000); + + /* Ok, all threads are patiently blocked, start waking folks up */ + wakeup_threads(waking_worker, thread_attr); + + for (i = 0; i < nblocked_threads; i++) { + ret = pthread_join(blocked_worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + do_run_stats(waking_worker); + if (!silent) + print_run(waking_worker, j); + + free(waking_worker); + } + + /* cleanup & report results */ + pthread_cond_destroy(&thread_parent); + pthread_cond_destroy(&thread_worker); + pthread_mutex_destroy(&thread_lock); + pthread_attr_destroy(&thread_attr); + + print_summary(); + + free(blocked_worker); + return ret; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index b9a56fa83330..b5314e452ec7 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -58,6 +58,7 @@ static struct bench mem_benchmarks[] = { static struct bench futex_benchmarks[] = { { "hash", "Benchmark for futex hash table", bench_futex_hash }, { "wake", "Benchmark for futex wake calls", bench_futex_wake }, + { "wake-parallel", "Benchmark for parallel futex wake calls", bench_futex_wake_parallel }, { "requeue", "Benchmark for futex requeue calls", bench_futex_requeue }, { "all", "Test all futex benchmarks", NULL }, { NULL, NULL, NULL } -- cgit v1.2.3