diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 12:00:13 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 12:00:13 -0700 |
commit | 9a45da9270b64b14e154093c28f746d861ab8c61 (patch) | |
tree | bbe75582da2e9d5f9cce4bc39a6cc32fcc171ee1 /kernel | |
parent | 68a32ba14177d4a21c4a9a941cf1d7aea86d436f (diff) | |
parent | 120b566d1df22a0a4543ac0e8aef875c49dd2c21 (diff) |
Merge tag 'core-rcu-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
- Support for "N" as alias for last bit in bitmap parsing library (eg
using syntax like "nohz_full=2-N")
- kvfree_rcu updates
- mm_dump_obj() updates. (One of these is to mm, but was suggested by
Andrew Morton.)
- RCU callback offloading update
- Polling RCU grace-period interfaces
- Realtime-related RCU updates
- Tasks-RCU updates
- Torture-test updates
- Torture-test scripting updates
- Miscellaneous fixes
* tag 'core-rcu-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (77 commits)
rcutorture: Test start_poll_synchronize_rcu() and poll_state_synchronize_rcu()
rcu: Provide polling interfaces for Tiny RCU grace periods
torture: Fix kvm.sh --datestamp regex check
torture: Consolidate qemu-cmd duration editing into kvm-transform.sh
torture: Print proper vmlinux path for kvm-again.sh runs
torture: Make TORTURE_TRUST_MAKE available in kvm-again.sh environment
torture: Make kvm-transform.sh update jitter commands
torture: Add --duration argument to kvm-again.sh
torture: Add kvm-again.sh to rerun a previous torture-test
torture: Create a "batches" file for build reuse
torture: De-capitalize TORTURE_SUITE
torture: Make upper-case-only no-dot no-slash scenario names official
torture: Rename SRCU-t and SRCU-u to avoid lowercase characters
torture: Remove no-mpstat error message
torture: Record kvm-test-1-run.sh and kvm-test-1-run-qemu.sh PIDs
torture: Record jitter start/stop commands
torture: Extract kvm-test-1-run-qemu.sh from kvm-test-1-run.sh
torture: Record TORTURE_KCONFIG_GDB_ARG in qemu-cmd
torture: Abstract jitter.sh start/stop into scripts
rcu: Provide polling interfaces for Tree RCU grace periods
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcu/rcu_segcblist.c | 3 | ||||
-rw-r--r-- | kernel/rcu/rcuscale.c | 15 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 93 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 40 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 40 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 169 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 1 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 252 | ||||
-rw-r--r-- | kernel/rcu/tree_stall.h | 2 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/torture.c | 6 |
11 files changed, 457 insertions, 166 deletions
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7f181c9675f7..aaa111237b60 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,8 +261,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. This - * structure must be empty. + * Mark the specified rcu_segcblist structure as offloaded. */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 06491d5530db..dca51fe9c73f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -625,6 +625,8 @@ rcu_scale_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?"); +torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?"); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -644,10 +646,13 @@ kfree_scale_thread(void *arg) struct kfree_obj *alloc_ptr; u64 start_time, end_time; long long mem_begin, mem_during = 0; + bool kfree_rcu_test_both; + DEFINE_TORTURE_RANDOM(tr); VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); + kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double); start_time = ktime_get_mono_fast_ns(); @@ -670,7 +675,15 @@ kfree_scale_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - kfree_rcu(alloc_ptr, rh); + // By default kfree_rcu_test_single and kfree_rcu_test_double are + // initialized to false. If both have the same value (false or true) + // both are randomly tested, otherwise only the one with value true + // is tested. + if ((kfree_rcu_test_single && !kfree_rcu_test_double) || + (kfree_rcu_test_both && torture_random(&tr) & 0x800)) + kfree_rcu(alloc_ptr); + else + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99657ffa6688..29d2f4c647d3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -245,11 +245,11 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT) +# define rcu_can_boost() 1 +#else +# define rcu_can_boost() 0 +#endif #ifdef CONFIG_RCU_TRACE static u64 notrace rcu_trace_clock_local(void) @@ -494,6 +494,8 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .start_gp_poll = start_poll_synchronize_rcu, + .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, @@ -923,9 +925,13 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) { + static int dbg_done; + if (end - start > test_boost_duration * HZ - HZ / 2) { VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; + if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); return true; /* failed */ } @@ -948,8 +954,8 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { - /* Track if the test failed already in this test interval? */ - bool failed = false; + bool failed = false; // Test failed already in this test interval + bool firsttime = true; /* Increment n_rcu_torture_boosts once per boost-test */ while (!kthread_should_stop()) { @@ -975,18 +981,17 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); - call_rcu(&rbi.rcu, rcu_torture_boost_cb); + cur_ops->call(&rbi.rcu, rcu_torture_boost_cb); /* Check if the boost test failed */ - failed = failed || - rcu_torture_boost_failed(call_rcu_time, - jiffies); + if (!firsttime && !failed) + failed = rcu_torture_boost_failed(call_rcu_time, jiffies); call_rcu_time = jiffies; + firsttime = false; } if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); @@ -999,7 +1004,7 @@ static int rcu_torture_boost(void *arg) * this case the boost check would never happen in the above * loop so do another one here. */ - if (!failed && smp_load_acquire(&rbi.inflight)) + if (!firsttime && !failed && smp_load_acquire(&rbi.inflight)) rcu_torture_boost_failed(call_rcu_time, jiffies); /* @@ -1025,6 +1030,9 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); + while (smp_load_acquire(&rbi.inflight)) + schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks. + /* Clean up and exit. */ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1223,14 +1231,6 @@ rcu_torture_writer(void *arg) WARN_ON_ONCE(1); break; } - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && - !cur_ops->poll_gp_state(cookie), - "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", - __func__, - rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); } WRITE_ONCE(rcu_torture_current_version, rcu_torture_current_version + 1); @@ -1589,7 +1589,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) preempt_enable(); if (cur_ops->get_gp_state && cur_ops->poll_gp_state) WARN_ONCE(cur_ops->poll_gp_state(cookie), - "%s: Cookie check 3 failed %s(%d) %lu->%lu\n", + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, @@ -1797,7 +1797,7 @@ rcu_torture_stats_print(void) WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio - WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed + WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); @@ -1861,6 +1861,45 @@ rcu_torture_stats(void *arg) torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_stats"); + + { + struct rcu_head *rhp; + struct kmem_cache *kcp; + static int z; + + kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); + pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); + mem_dump_obj(ZERO_SIZE_PTR); + pr_alert("mem_dump_obj(NULL):"); + mem_dump_obj(NULL); + pr_alert("mem_dump_obj(%px):", &rhp); + mem_dump_obj(&rhp); + pr_alert("mem_dump_obj(%px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(%px):", &rhp->func); + mem_dump_obj(&rhp->func); + pr_alert("mem_dump_obj(%px):", &z); + mem_dump_obj(&z); + kmem_cache_free(kcp, rhp); + kmem_cache_destroy(kcp); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(kmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + kfree(rhp); + rhp = vmalloc(4096); + pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(vmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + vfree(rhp); + } + return 0; } @@ -1971,8 +2010,8 @@ static int rcu_torture_stall(void *args) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("rcu_torture_stall start on CPU %d.\n", - raw_smp_processor_id()); + pr_alert("%s start on CPU %d.\n", + __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) if (stall_cpu_block) @@ -1983,7 +2022,7 @@ static int rcu_torture_stall(void *args) preempt_enable(); cur_ops->readunlock(idx); } - pr_alert("rcu_torture_stall end.\n"); + pr_alert("%s end.\n", __func__); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -2595,6 +2634,8 @@ static bool rcu_torture_can_boost(void) if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) return false; + if (!cur_ops->call) + return false; prio = rcu_get_gp_kthreads_prio(); if (!prio) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index af7c19439f4e..350ebf5051f9 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -20,7 +20,7 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** - * Definition for a Tasks-RCU-like mechanism. + * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_head: Head of callback list. * @cbs_tail: Tail pointer for callback list. * @cbs_wq: Wait queue allowning new callback to get kthread's attention. @@ -38,7 +38,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). - * @holdout_func: This flavor's holdout-list scan function (optional). + * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @name: This flavor's textual name. @@ -726,6 +726,42 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace // readers can operate from idle, offline, and exception entry/exit in no // way allows rcu_preempt and rcu_sched readers to also do so. +// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_trace_pregp_step(): +// Initialize the count of readers and block CPU-hotplug operations. +// rcu_tasks_trace_pertask(), invoked on every non-idle task: +// Initialize per-task state and attempt to identify an immediate +// quiescent state for that task, or, failing that, attempt to +// set that task's .need_qs flag so that task's next outermost +// rcu_read_unlock_trace() will report the quiescent state (in which +// case the count of readers is incremented). If both attempts fail, +// the task is added to a "holdout" list. +// rcu_tasks_trace_postscan(): +// Initialize state and attempt to identify an immediate quiescent +// state as above (but only for idle tasks), unblock CPU-hotplug +// operations, and wait for an RCU grace period to avoid races with +// tasks that are in the process of exiting. +// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. +// rcu_tasks_trace_postgp(): +// Wait for the count of readers do drop to zero, reporting any stalls. +// Also execute full memory barriers to maintain ordering with code +// executing after the grace period. +// +// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. +// +// Pre-grace-period update-side code is ordered before the grace +// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). +// Pre-grace-period read-side code is ordered before the grace period by +// atomic_dec_and_test() of the count of readers (for IPIed readers) and by +// scheduler context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index aa897c3f2e92..c8a029fbb114 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -32,12 +32,14 @@ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ + unsigned long gp_seq; /* Grace-period counter. */ }; /* Definition for rcupdate control block. */ static struct rcu_ctrlblk rcu_ctrlblk = { .donetail = &rcu_ctrlblk.rcucblist, .curtail = &rcu_ctrlblk.rcucblist, + .gp_seq = 0 - 300UL, }; void rcu_barrier(void) @@ -56,6 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); local_irq_restore(flags); } @@ -177,6 +180,43 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Return a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +unsigned long get_state_synchronize_rcu(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/* + * Return a grace-period-counter "cookie" and ensure that a future grace + * period completes. For more information, see the Tree RCU header comment. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_qs() */ + resched_cpu(0); + } + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/* + * Return true if the grace period corresponding to oldstate has completed + * and false otherwise. For more information, see the Tree RCU header + * comment. + */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da6f5213fb74..8e78b2430c16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,7 @@ static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -648,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rdp = this_cpu_ptr(&rcu_data); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -1077,7 +1077,6 @@ noinstr void rcu_nmi_enter(void) } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); - instrumentation_end(); } else { instrumentation_begin(); } @@ -1672,7 +1671,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); raw_lockdep_assert_held_rcu_node(rnp); @@ -2128,7 +2127,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2327,7 +2326,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2414,7 +2413,7 @@ int rcutree_dying_cpu(unsigned int cpu) blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); return 0; } @@ -2497,7 +2496,7 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -3066,7 +3065,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); /* Go handle any RCU core processing required. */ - if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_rdp_is_offloaded(rdp))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3229,8 +3228,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline struct kvfree_rcu_bulk_data * @@ -3464,7 +3462,7 @@ static void fill_page_cache_func(struct work_struct *work) for (i = 0; i < rcu_min_cached_objs; i++) { bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NOWARN); + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); if (bnode) { raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3493,37 +3491,62 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) } } +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. static inline bool -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) { struct kvfree_rcu_bulk_data *bnode; int idx; - if (unlikely(!krcp->initialized)) + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) return false; - lockdep_assert_held(&krcp->lock); idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bkvhead[idx] || - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(krcp); - /* Switch to emergency path. */ + if (!(*krcp)->bkvhead[idx] || + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view. Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + *krcp = krc_this_cpu_lock(flags); + } + if (!bnode) return false; /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bkvhead[idx]; + bnode->next = (*krcp)->bkvhead[idx]; /* Attach it to the head. */ - krcp->bkvhead[idx] = bnode; + (*krcp)->bkvhead[idx] = bnode; } /* Finally insert. */ - krcp->bkvhead[idx]->records - [krcp->bkvhead[idx]->nr_records++] = ptr; + (*krcp)->bkvhead[idx]->records + [(*krcp)->bkvhead[idx]->nr_records++] = ptr; return true; } @@ -3561,8 +3584,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) ptr = (unsigned long *) func; } - krcp = krc_this_cpu_lock(&flags); - // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. @@ -3570,12 +3591,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); // Mark as success and leave. - success = true; - goto unlock_return; + return; } kasan_record_aux_stack(ptr); - success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); @@ -3774,8 +3794,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * to determine whether or not a full grace period has elapsed in the - * meantime. + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. */ unsigned long get_state_synchronize_rcu(void) { @@ -3789,13 +3809,76 @@ unsigned long get_state_synchronize_rcu(void) EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); /** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long flags; + unsigned long gp_seq = get_state_synchronize_rcu(); + bool needwake; + struct rcu_data *rdp; + struct rcu_node *rnp; + + lockdep_assert_irqs_enabled(); + local_irq_save(flags); + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); // irqs already disabled. + needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(); + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/** + * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call from + * which oldstate was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibilty to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @oldstate + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!). + * Those needing to keep oldstate values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally + * and either refresh them or set a flag indicating that the grace period + * has completed. + */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + +/** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * * @oldstate: return value from earlier call to get_state_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call to - * get_state_synchronize_rcu(), just return. Otherwise, invoke - * synchronize_rcu() to wait for a full grace period. + * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. + * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * * Yes, this function does not take counter wrap into account. But * counter wrap is harmless. If the counter wraps, we have waited for @@ -3804,10 +3887,8 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) + if (!poll_state_synchronize_rcu(oldstate)) synchronize_rcu(); - else - smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); @@ -3843,13 +3924,13 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist) && + !rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3968,7 +4049,7 @@ void rcu_barrier(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (cpu_is_offline(cpu) && - !rcu_segcblist_is_offloaded(&rdp->cblist)) + !rcu_rdp_is_offloaded(rdp)) continue; if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, @@ -4083,15 +4164,13 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* - * Lock in case the CB/GP kthreads are still around handling - * old callbacks (longer term we should flush all callbacks - * before completing CPU offline) + * Only non-NOCB CPUs that didn't have early-boot callbacks need to be + * (re-)initialized. */ - rcu_nocb_lock(rdp); - if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + if (!rcu_segcblist_is_enabled(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4291,7 +4370,7 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_segcblist_is_offloaded(&rdp->cblist) || + if (rcu_rdp_is_offloaded(rdp) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -4309,7 +4388,7 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { + if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); } else { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6c6ff06d4ae6..2796084ef85a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -521,6 +521,7 @@ static void synchronize_rcu_expedited_wait(void) if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); + trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); ndetected = 0; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dc..ad0156b86937 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,8 +16,70 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return (timer_curr_running(&rdp->nocb_timer) && !in_irq()); +} +#else +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return false; +} + #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) +{ + /* + * In order to read the offloaded state of an rdp is a safe + * and stable way and prevent from its value to be changed + * under us, we must either hold the barrier mutex, the cpu + * hotplug lock (read or write) or the nocb lock. Local + * non-preemptible reads are also safe. NOCB kthreads and + * timers have their own means of synchronization against the + * offloaded state updaters. + */ + RCU_LOCKDEP_WARN( + !(lockdep_is_held(&rcu_state.barrier_mutex) || + (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || + rcu_lockdep_is_held_nocb(rdp) || + (rdp == this_cpu_ptr(&rcu_data) && + !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + rcu_current_is_nocb_kthread(rdp) || + rcu_running_nocb_timer(rdp)), + "Unsafe read of RCU_NOCB offloaded state" + ); + + return rcu_segcblist_is_offloaded(&rdp->cblist); +} + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. @@ -393,8 +455,9 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; + barrier(); // critical section before exit code. if (rcu_preempt_read_exit() == 0) { - barrier(); /* critical section before exit code. */ + barrier(); // critical-section exit before .s check. if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); } @@ -598,9 +661,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) static void rcu_read_unlock_special(struct task_struct *t) { unsigned long flags; + bool irqs_were_disabled; bool preempt_bh_were_disabled = !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); - bool irqs_were_disabled; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -609,30 +672,33 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool exp; + bool expboost; // Expedited GP in flight or possible boosting. struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && - READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)); + expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && + t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the - // wakeup is free or there is an expedited GP. + // wakeup is free or there is either an expedited + // GP in flight or a potential need to deboost. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting, slow is OK. - // Plus nohz_full CPUs eventually get tick enabled. + // Also if no expediting and no possible deboosting, + // slow is OK. Plus nohz_full CPUs eventually get + // tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) { + expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); + init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -1257,7 +1323,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } /* @@ -1352,7 +1418,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ if (rcu_segcblist_empty(&rdp->cblist) || - rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { + rcu_rdp_is_offloaded(rdp)) { *nextevt = KTIME_MAX; return 0; } @@ -1388,7 +1454,7 @@ static void rcu_prepare_for_idle(void) int tne; lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; /* Handle nohz enablement switches conservatively. */ @@ -1429,7 +1495,7 @@ static void rcu_cleanup_after_idle(void) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1464,14 +1530,12 @@ static void rcu_cleanup_after_idle(void) /* * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a - * comma-separated list of CPUs and/or CPU ranges. If an invalid list is - * given, a warning is emitted and all CPUs are offloaded. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (!strcasecmp(str, "all")) + if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */ cpumask_setall(rcu_nocb_mask); else if (cpulist_parse(str, rcu_nocb_mask)) { @@ -1494,7 +1558,7 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * After all, the main point of bypassing is to avoid lock contention * on ->nocb_lock, which only can happen at high call_rcu() rates. */ -int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* @@ -1560,7 +1624,7 @@ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) static void rcu_nocb_lock(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return; raw_spin_lock(&rdp->nocb_lock); } @@ -1571,7 +1635,7 @@ static void rcu_nocb_lock(struct rcu_data *rdp) */ static void rcu_nocb_unlock(struct rcu_data *rdp) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_lock); } @@ -1584,7 +1648,7 @@ static void rcu_nocb_unlock(struct rcu_data *rdp) static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, unsigned long flags) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } else { @@ -1596,7 +1660,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) lockdep_assert_held(&rdp->nocb_lock); } @@ -1641,12 +1705,16 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force, lockdep_assert_held(&rdp->nocb_lock); if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); - rcu_nocb_unlock_irqrestore(rdp, flags); return false; } - del_timer(&rdp->nocb_timer); + + if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp->nocb_timer); + } rcu_nocb_unlock_irqrestore(rdp, flags); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -1690,7 +1758,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, { struct rcu_cblist rcl; - WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); rcu_lockdep_assert_cblist_protected(rdp); lockdep_assert_held(&rdp->nocb_bypass_lock); if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { @@ -1718,7 +1786,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j) { - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return true; rcu_lockdep_assert_cblist_protected(rdp); rcu_nocb_bypass_lock(rdp); @@ -1732,7 +1800,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + if (!rcu_rdp_is_offloaded(rdp) || !rcu_nocb_bypass_trylock(rdp)) return; WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); @@ -1764,11 +1832,22 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); return false; /* Not offloaded, no bypassing. */ } - lockdep_assert_irqs_disabled(); // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { @@ -1878,9 +1957,9 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { + rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); - rcu_nocb_unlock_irqrestore(rdp, flags); return; } // Need to actually to a wakeup. @@ -1915,8 +1994,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, TPS("WakeOvfIsDeferred")); rcu_nocb_unlock_irqrestore(rdp, flags); } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } return; } @@ -1954,7 +2033,8 @@ static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) return rcu_segcblist_test_flags(&rdp->cblist, flags); } -static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state) +static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, + bool *needwake_state) { struct rcu_segcblist *cblist = &rdp->cblist; @@ -1964,7 +2044,7 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) *needwake_state = true; } - return true; + return false; } /* @@ -1975,7 +2055,7 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) *needwake_state = true; - return false; + return true; } @@ -2013,7 +2093,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); - if (!nocb_gp_update_state(rdp, &needwake_state)) { + if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_state) swake_up_one(&rdp->nocb_state_wq); @@ -2168,11 +2248,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) unsigned long flags; bool needwake_state = false; bool needwake_gp = false; + bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); + /* + * Disable BH to provide the expected environment. Also, when + * transitioning to/from NOCB mode, a self-requeuing callback might + * be invoked from softirq. A short grace period could cause both + * instances of this callback would execute concurrently. + */ local_bh_disable(); rcu_do_batch(rdp); local_bh_enable(); @@ -2185,8 +2272,6 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - WRITE_ONCE(rdp->nocb_cb_sleep, true); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); @@ -2194,7 +2279,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) needwake_state = true; } if (rcu_segcblist_ready_cbs(cblist)) - WRITE_ONCE(rdp->nocb_cb_sleep, false); + can_sleep = false; } else { /* * De-offloading. Clear our flag and notify the de-offload worker. @@ -2207,6 +2292,8 @@ static void nocb_cb_wait(struct rcu_data *rdp) needwake_state = true; } + WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); + if (rdp->nocb_cb_sleep) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); @@ -2265,7 +2352,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); @@ -2331,24 +2417,28 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return 0; } -static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +static long rcu_nocb_rdp_deoffload(void *arg) { + struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int ret; + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + pr_info("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); /* - * If there are still pending work offloaded, the offline - * CPU won't help much handling them. + * Flush once and for all now. This suffices because we are + * running on the target CPU holding ->nocb_lock (thus having + * interrupts disabled), and because rdp_offload_toggle() + * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. + * Thus future calls to rcu_segcblist_completely_offloaded() will + * return false, which means that future calls to rcu_nocb_try_bypass() + * will refuse to put anything into the bypass. */ - if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - return -EBUSY; - } - + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | @@ -2360,30 +2450,22 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) del_timer_sync(&rdp->nocb_timer); /* - * Flush bypass. While IRQs are disabled and once we set - * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be - * enqueued on bypass. + * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked + * and IRQs disabled but let's be paranoid. */ rcu_nocb_lock_irqsave(rdp, flags); - rcu_nocb_flush_bypass(rdp, NULL, jiffies); rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); /* * With SEGCBLIST_SOFTIRQ_ONLY, we can't use - * rcu_nocb_unlock_irqrestore() anymore. Theoretically we - * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs - * disabled now, but let's be paranoid. + * rcu_nocb_unlock_irqrestore() anymore. */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - return ret; -} + /* Sanity check */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); -static long rcu_nocb_rdp_deoffload(void *arg) -{ - struct rcu_data *rdp = arg; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - return __rcu_nocb_rdp_deoffload(rdp); + return ret; } int rcu_nocb_cpu_deoffload(int cpu) @@ -2397,13 +2479,15 @@ int rcu_nocb_cpu_deoffload(int cpu) } mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) + if (rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - else - ret = __rcu_nocb_rdp_deoffload(rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + ret = -EINVAL; + } } cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); @@ -2412,12 +2496,14 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) +static long rcu_nocb_rdp_offload(void *arg) { + struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int ret; + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -2457,14 +2543,6 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) return ret; } -static long rcu_nocb_rdp_offload(void *arg) -{ - struct rcu_data *rdp = arg; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - return __rcu_nocb_rdp_offload(rdp); -} - int rcu_nocb_cpu_offload(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2472,13 +2550,15 @@ int rcu_nocb_cpu_offload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) + if (!rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - else - ret = __rcu_nocb_rdp_offload(rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-offload an offline CPU\n"); + ret = -EINVAL; + } } cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 475b26171b20..59b95cc5cbdf 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -536,6 +536,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ + trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected")); pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -606,6 +607,7 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ + trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(smp_processor_id()); diff --git a/kernel/softirq.c b/kernel/softirq.c index 5a99696da86a..4992853ef53d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -423,7 +423,7 @@ static inline void invoke_softirq(void) if (ksoftirqd_running(local_softirq_pending())) return; - if (!force_irqthreads) { + if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if diff --git a/kernel/torture.c b/kernel/torture.c index 01e336f1e5b2..0a315c387bed 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -816,9 +816,9 @@ bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: Refusing %s init: %s running.\n", - ttype, torture_type); - pr_alert("torture_init_begin: One torture test at a time!\n"); + pr_alert("%s: Refusing %s init: %s running.\n", + __func__, ttype, torture_type); + pr_alert("%s: One torture test at a time!\n", __func__); mutex_unlock(&fullstop_mutex); return false; } |