diff options
Diffstat (limited to 'kernel')
59 files changed, 2010 insertions, 1155 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..68a89a9f7ccd 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..7787bdcb5d68 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; @@ -4564,7 +4564,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4573,6 +4573,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9a500fadbef5..4f1472409ef8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -227,6 +227,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; @@ -302,8 +305,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; int err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..966b7b34cde0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -1510,6 +1512,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { @@ -2093,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ @@ -2787,7 +2795,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2799,6 +2807,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3062,7 +3071,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3071,6 +3080,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -3354,7 +3364,7 @@ err_put: SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3366,6 +3376,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; @@ -782,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..3dead0416b91 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } @@ -4595,6 +4604,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4622,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4630,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } @@ -6258,6 +6271,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; } + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock(); while (true) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c706af713fb..221bf6a9e98a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void) static void lockdep_acquire_cpus_lock(void) { - rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); + rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..d22e4ba59dfa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,6 +49,7 @@ #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> +#include <linux/min_heap.h> #include "internal.h" @@ -891,6 +892,47 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_unlock(); } +static int perf_cgroup_ensure_storage(struct perf_event *event, + struct cgroup_subsys_state *css) +{ + struct perf_cpu_context *cpuctx; + struct perf_event **storage; + int cpu, heap_size, ret = 0; + + /* + * Allow storage to have sufficent space for an iterator for each + * possibly nested cgroup plus an iterator for events with no cgroup. + */ + for (heap_size = 1; css; css = css->parent) + heap_size++; + + for_each_possible_cpu(cpu) { + cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + if (heap_size <= cpuctx->heap_size) + continue; + + storage = kmalloc_node(heap_size * sizeof(struct perf_event *), + GFP_KERNEL, cpu_to_node(cpu)); + if (!storage) { + ret = -ENOMEM; + break; + } + + raw_spin_lock_irq(&cpuctx->ctx.lock); + if (cpuctx->heap_size < heap_size) { + swap(cpuctx->heap, storage); + if (storage == cpuctx->heap_default) + storage = NULL; + cpuctx->heap_size = heap_size; + } + raw_spin_unlock_irq(&cpuctx->ctx.lock); + + kfree(storage); + } + + return ret; +} + static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -910,6 +952,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, goto out; } + ret = perf_cgroup_ensure_storage(event, css); + if (ret) + goto out; + cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -1531,6 +1577,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right) if (left->cpu > right->cpu) return false; +#ifdef CONFIG_CGROUP_PERF + if (left->cgrp != right->cgrp) { + if (!left->cgrp || !left->cgrp->css.cgroup) { + /* + * Left has no cgroup but right does, no cgroups come + * first. + */ + return true; + } + if (!right->cgrp || !right->cgrp->css.cgroup) { + /* + * Right has no cgroup but left does, no cgroups come + * first. + */ + return false; + } + /* Two dissimilar cgroups, order by id. */ + if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) + return true; + + return false; + } +#endif + if (left->group_index < right->group_index) return true; if (left->group_index > right->group_index) @@ -1610,25 +1680,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the @cpu subtree. + * Get the leftmost event in the cpu/cgroup subtree. */ static struct perf_event * -perf_event_groups_first(struct perf_event_groups *groups, int cpu) +perf_event_groups_first(struct perf_event_groups *groups, int cpu, + struct cgroup *cgrp) { struct perf_event *node_event = NULL, *match = NULL; struct rb_node *node = groups->tree.rb_node; +#ifdef CONFIG_CGROUP_PERF + u64 node_cgrp_id, cgrp_id = 0; + + if (cgrp) + cgrp_id = cgrp->kn->id; +#endif while (node) { node_event = container_of(node, struct perf_event, group_node); if (cpu < node_event->cpu) { node = node->rb_left; - } else if (cpu > node_event->cpu) { + continue; + } + if (cpu > node_event->cpu) { node = node->rb_right; - } else { - match = node_event; + continue; + } +#ifdef CONFIG_CGROUP_PERF + node_cgrp_id = 0; + if (node_event->cgrp && node_event->cgrp->css.cgroup) + node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + + if (cgrp_id < node_cgrp_id) { node = node->rb_left; + continue; + } + if (cgrp_id > node_cgrp_id) { + node = node->rb_right; + continue; } +#endif + match = node_event; + node = node->rb_left; } return match; @@ -1641,12 +1734,26 @@ static struct perf_event * perf_event_groups_next(struct perf_event *event) { struct perf_event *next; +#ifdef CONFIG_CGROUP_PERF + u64 curr_cgrp_id = 0; + u64 next_cgrp_id = 0; +#endif next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next && next->cpu == event->cpu) - return next; + if (next == NULL || next->cpu != event->cpu) + return NULL; - return NULL; +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp && event->cgrp->css.cgroup) + curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + + if (next->cgrp && next->cgrp->css.cgroup) + next_cgrp_id = next->cgrp->css.cgroup->kn->id; + + if (curr_cgrp_id != next_cgrp_id) + return NULL; +#endif + return next; } /* @@ -1986,6 +2093,12 @@ static int perf_get_aux_event(struct perf_event *event, return 1; } +static inline struct list_head *get_event_list(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -2028,12 +2141,8 @@ static void perf_group_detach(struct perf_event *event) if (!RB_EMPTY_NODE(&event->group_node)) { add_event_to_groups(sibling, event->ctx); - if (sibling->state == PERF_EVENT_STATE_ACTIVE) { - struct list_head *list = sibling->attr.pinned ? - &ctx->pinned_active : &ctx->flexible_active; - - list_add_tail(&sibling->active_list, list); - } + if (sibling->state == PERF_EVENT_STATE_ACTIVE) + list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); @@ -2182,6 +2291,7 @@ __perf_remove_from_context(struct perf_event *event, if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; + ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2350,6 +2460,8 @@ event_sched_in(struct perf_event *event, { int ret = 0; + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) @@ -3077,12 +3189,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; - /* - * If we had been multiplexing, no rotations are necessary, now no events - * are active. - */ - ctx->rotate_necessary = 0; - perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3092,6 +3198,13 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); + + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + ctx->rotate_necessary = 0; } perf_pmu_enable(ctx->pmu); } @@ -3388,71 +3501,103 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static int visit_groups_merge(struct perf_event_groups *groups, int cpu, - int (*func)(struct perf_event *, void *), void *data) +static bool perf_less_group_idx(const void *l, const void *r) { - struct perf_event **evt, *evt1, *evt2; - int ret; - - evt1 = perf_event_groups_first(groups, -1); - evt2 = perf_event_groups_first(groups, cpu); - - while (evt1 || evt2) { - if (evt1 && evt2) { - if (evt1->group_index < evt2->group_index) - evt = &evt1; - else - evt = &evt2; - } else if (evt1) { - evt = &evt1; - } else { - evt = &evt2; - } + const struct perf_event *le = l, *re = r; - ret = func(*evt, data); - if (ret) - return ret; + return le->group_index < re->group_index; +} - *evt = perf_event_groups_next(*evt); - } +static void swap_ptr(void *l, void *r) +{ + void **lp = l, **rp = r; - return 0; + swap(*lp, *rp); } -struct sched_in_data { - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - int can_add_hw; +static const struct min_heap_callbacks perf_min_heap = { + .elem_size = sizeof(struct perf_event *), + .less = perf_less_group_idx, + .swp = swap_ptr, }; -static int pinned_sched_in(struct perf_event *event, void *data) +static void __heap_add(struct min_heap *heap, struct perf_event *event) { - struct sched_in_data *sid = data; + struct perf_event **itrs = heap->data; - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; + if (event) { + itrs[heap->nr] = event; + heap->nr++; + } +} - if (!event_filter_match(event)) - return 0; +static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, + struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), + void *data) +{ +#ifdef CONFIG_CGROUP_PERF + struct cgroup_subsys_state *css = NULL; +#endif + /* Space for per CPU and/or any CPU event iterators. */ + struct perf_event *itrs[2]; + struct min_heap event_heap; + struct perf_event **evt; + int ret; + + if (cpuctx) { + event_heap = (struct min_heap){ + .data = cpuctx->heap, + .nr = 0, + .size = cpuctx->heap_size, + }; + + lockdep_assert_held(&cpuctx->ctx.lock); - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->pinned_active); +#ifdef CONFIG_CGROUP_PERF + if (cpuctx->cgrp) + css = &cpuctx->cgrp->css; +#endif + } else { + event_heap = (struct min_heap){ + .data = itrs, + .nr = 0, + .size = ARRAY_SIZE(itrs), + }; + /* Events not within a CPU context may be on any CPU. */ + __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); } + evt = event_heap.data; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + +#ifdef CONFIG_CGROUP_PERF + for (; css; css = css->parent) + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); +#endif + + min_heapify_all(&event_heap, &perf_min_heap); + + while (event_heap.nr) { + ret = func(*evt, data); + if (ret) + return ret; + + *evt = perf_event_groups_next(*evt); + if (*evt) + min_heapify(&event_heap, 0, &perf_min_heap); + else + min_heap_pop(&event_heap, &perf_min_heap); + } return 0; } -static int flexible_sched_in(struct perf_event *event, void *data) +static int merge_sched_in(struct perf_event *event, void *data) { - struct sched_in_data *sid = data; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -3460,14 +3605,17 @@ static int flexible_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - int ret = group_sched_in(event, sid->cpuctx, sid->ctx); - if (ret) { - sid->can_add_hw = 0; - sid->ctx->rotate_necessary = 1; - return 0; - } - list_add_tail(&event->active_list, &sid->ctx->flexible_active); + if (group_can_go_on(event, cpuctx, *can_add_hw)) { + if (!group_sched_in(event, cpuctx, ctx)) + list_add_tail(&event->active_list, get_event_list(event)); + } + + if (event->state == PERF_EVENT_STATE_INACTIVE) { + if (event->attr.pinned) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + *can_add_hw = 0; + ctx->rotate_necessary = 1; } return 0; @@ -3477,30 +3625,28 @@ static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; - visit_groups_merge(&ctx->pinned_groups, + if (ctx != &cpuctx->ctx) + cpuctx = NULL; + + visit_groups_merge(cpuctx, &ctx->pinned_groups, smp_processor_id(), - pinned_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct sched_in_data sid = { - .ctx = ctx, - .cpuctx = cpuctx, - .can_add_hw = 1, - }; + int can_add_hw = 1; - visit_groups_merge(&ctx->flexible_groups, + if (ctx != &cpuctx->ctx) + cpuctx = NULL; + + visit_groups_merge(cpuctx, &ctx->flexible_groups, smp_processor_id(), - flexible_sched_in, &sid); + merge_sched_in, &can_add_hw); } static void @@ -3841,6 +3987,12 @@ ctx_event_to_rotate(struct perf_event_context *ctx) typeof(*event), group_node); } + /* + * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() + * finds there are unschedulable events, it will set it again. + */ + ctx->rotate_necessary = 0; + return event; } @@ -6555,6 +6707,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6643,6 +6800,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6836,6 +6995,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } @@ -10349,6 +10511,9 @@ skip_type: cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); + + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } got_cpu_context: @@ -10794,12 +10959,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; - if (cgroup_fd != -1) { - err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); - if (err) - goto err_ns; - } - pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); @@ -10821,6 +10980,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; } + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -10881,12 +11046,12 @@ err_per_task: exclusive_event_destroy(event); err_pmu: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); if (event->hw.target) diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..764960fabfa1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w) wake_up_process(task); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX @@ -619,8 +620,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..d90af13431c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..b59532862bc0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -135,8 +135,7 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers for both - * shared and private futexes in get_futex_key_refs(). + * to futex and the waiters read (see hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -331,17 +330,6 @@ static void compat_exit_robust_list(struct task_struct *curr); static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif -static inline void futex_get_mm(union futex_key *key) -{ - mmgrab(key->private.mm); - /* - * Ensure futex_get_mm() implies a full barrier such that - * get_futex_key() implies a full barrier. This is relied upon - * as smp_mb(); (B), see the ordering comment above. - */ - smp_mb__after_atomic(); -} - /* * Reflects a new waiter being added to the waitqueue. */ @@ -370,6 +358,10 @@ static inline void hb_waiters_dec(struct futex_hash_bucket *hb) static inline int hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. + */ + smp_mb(); return atomic_read(&hb->waiters); #else return 1; @@ -385,9 +377,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } @@ -407,70 +399,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - - /* - * On MMU less systems futexes are always "private" as there is no per - * process address space. We need the smp wmb nevertheless - yes, - * arch/blackfin has MMU less SMP ... - */ - if (!IS_ENABLED(CONFIG_MMU)) { - smp_mb(); /* explicit smp_mb(); (B) */ - return; - } - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ - break; - case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies smp_mb(); (B) */ - break; - default: - /* - * Private futexes do not hold reference on an inode or - * mm, therefore the only purpose of calling get_futex_key_refs - * is because we need the barrier for the lockless waiter check. - */ - smp_mb(); /* explicit smp_mb(); (B) */ - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. This is - * a no-op for private futexes, see comment in the get - * counterpart. - */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) { - /* If we're here then we tried to put a key we failed to get */ - WARN_ON_ONCE(1); - return; - } - - if (!IS_ENABLED(CONFIG_MMU)) - return; - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - enum futex_access { FUTEX_READ, FUTEX_WRITE @@ -505,6 +433,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +485,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -556,7 +530,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -659,8 +632,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,36 +663,8 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } @@ -733,7 +676,6 @@ out: static inline void put_futex_key(union futex_key *key) { - drop_futex_key_refs(key); } /** @@ -1723,10 +1665,9 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) oparg = 1 << oparg; } - if (!access_ok(uaddr, sizeof(u32))) - return -EFAULT; - + pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + pagefault_enable(); if (ret) return ret; @@ -1868,7 +1809,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } - get_futex_key_refs(key2); q->key = *key2; } @@ -1890,7 +1830,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - get_futex_key_refs(key); q->key = *key; __unqueue_futex(q); @@ -2001,7 +1940,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; + int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; @@ -2122,7 +2061,6 @@ retry_private: */ if (ret > 0) { WARN_ON(pi_state); - drop_count++; task_count++; /* * If we acquired the lock, then the user space value @@ -2242,7 +2180,6 @@ retry_private: * doing so. */ requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; continue; } else if (ret) { /* @@ -2263,7 +2200,6 @@ retry_private: } } requeue_futex(this, hb1, hb2, &key2); - drop_count++; } /* @@ -2278,15 +2214,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - out_put_keys: put_futex_key(&key2); out_put_key1: @@ -2416,7 +2343,6 @@ retry: ret = 1; } - drop_futex_key_refs(&q->key); return ret; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a4ace611f47f..16ee716e8291 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -145,6 +145,13 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags for_each_action_of_desc(desc, action) { irqreturn_t res; + /* + * If this IRQ would be threaded under force_irqthreads, mark it so. + */ + if (irq_settings_can_thread(desc) && + !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))) + trace_hardirq_threaded(); + trace_irq_handler_entry(irq, action); res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7eee98c38f25..fe40c658f86f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 828cc30774bc..48b5d1b6af4d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -153,7 +153,9 @@ static void irq_work_run_list(struct llist_head *list) */ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + lockdep_irq_work_enter(work); work->func(work); + lockdep_irq_work_exit(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32406ef0d6a2..0ebf9807d971 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,12 +84,39 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *__owner; + +static inline void lockdep_lock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + arch_spin_lock(&__lock); + __owner = current; + current->lockdep_recursion++; +} + +static inline void lockdep_unlock(void) +{ + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) + return; + + current->lockdep_recursion--; + __owner = NULL; + arch_spin_unlock(&__lock); +} + +static inline bool lockdep_assert_locked(void) +{ + return DEBUG_LOCKS_WARN_ON(__owner != current); +} + static struct task_struct *lockdep_selftest_task_struct; + static int graph_lock(void) { - arch_spin_lock(&lockdep_lock); + lockdep_lock(); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -97,27 +124,15 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return 0; } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; return 1; } -static inline int graph_unlock(void) +static inline void graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; + lockdep_unlock(); } /* @@ -128,7 +143,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return ret; } @@ -147,6 +162,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); #define KEYHASH_SIZE (1UL << KEYHASH_BITS) static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; +unsigned long nr_zapped_classes; #ifndef CONFIG_DEBUG_LOCKDEP static #endif @@ -377,18 +393,31 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +/* + * Split the recrursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + void lockdep_off(void) { - current->lockdep_recursion++; + current->lockdep_recursion += LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { - current->lockdep_recursion--; + current->lockdep_recursion -= LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_on); +static inline void lockdep_recursion_finish(void) +{ + if (WARN_ON_ONCE(--current->lockdep_recursion)) + current->lockdep_recursion = 0; +} + void lockdep_set_selftest_task(struct task_struct *task) { lockdep_selftest_task_struct = task; @@ -575,6 +604,7 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -653,7 +683,9 @@ static void print_lock_name(struct lock_class *class) printk(KERN_CONT " ("); __print_lock_name(class); - printk(KERN_CONT "){%s}", usage); + printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + class->wait_type_outer ?: class->wait_type_inner, + class->wait_type_inner); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -787,6 +819,7 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } +/* used from NMI context -- must be lockless */ static inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { @@ -1070,13 +1103,15 @@ static inline void check_data_structures(void) { } #endif /* CONFIG_DEBUG_LOCKDEP */ +static void init_chain_block_buckets(void); + /* * Initialize the lock_classes[] array elements, the free_lock_classes list * and also the delayed_free structure. */ static void init_data_structures_once(void) { - static bool ds_initialized, rcu_head_initialized; + static bool __read_mostly ds_initialized, rcu_head_initialized; int i; if (likely(rcu_head_initialized)) @@ -1100,6 +1135,7 @@ static void init_data_structures_once(void) INIT_LIST_HEAD(&lock_classes[i].locks_after); INIT_LIST_HEAD(&lock_classes[i].locks_before); } + init_chain_block_buckets(); } static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) @@ -1230,6 +1266,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) WARN_ON_ONCE(!list_empty(&class->locks_before)); WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); + class->wait_type_inner = lock->wait_type_inner; + class->wait_type_outer = lock->wait_type_outer; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1469,6 +1507,8 @@ static int __bfs(struct lock_list *source_entry, struct circular_queue *cq = &lock_cq; int ret = 1; + lockdep_assert_locked(); + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; @@ -1491,8 +1531,6 @@ static int __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); - list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; @@ -1719,9 +1757,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -1746,9 +1784,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -2298,18 +2336,6 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, return 0; } -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - #else static inline int check_irq_usage(struct task_struct *curr, @@ -2317,13 +2343,27 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } +#endif /* CONFIG_TRACE_IRQFLAGS */ -static inline void inc_chains(void) +static void inc_chains(int irq_context) { - nr_process_chains++; + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains++; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains++; + else + nr_process_chains++; } -#endif /* CONFIG_TRACE_IRQFLAGS */ +static void dec_chains(int irq_context) +{ + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains--; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains--; + else + nr_process_chains--; +} static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2622,8 +2662,235 @@ out_bug: struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); -int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +unsigned long nr_zapped_lock_chains; +unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */ +unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */ +unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */ + +/* + * The first 2 chain_hlocks entries in the chain block in the bucket + * list contains the following meta data: + * + * entry[0]: + * Bit 15 - always set to 1 (it is not a class index) + * Bits 0-14 - upper 15 bits of the next block index + * entry[1] - lower 16 bits of next block index + * + * A next block index of all 1 bits means it is the end of the list. + * + * On the unsized bucket (bucket-0), the 3rd and 4th entries contain + * the chain block size: + * + * entry[2] - upper 16 bits of the chain block size + * entry[3] - lower 16 bits of the chain block size + */ +#define MAX_CHAIN_BUCKETS 16 +#define CHAIN_BLK_FLAG (1U << 15) +#define CHAIN_BLK_LIST_END 0xFFFFU + +static int chain_block_buckets[MAX_CHAIN_BUCKETS]; + +static inline int size_to_bucket(int size) +{ + if (size > MAX_CHAIN_BUCKETS) + return 0; + + return size - 1; +} + +/* + * Iterate all the chain blocks in a bucket. + */ +#define for_each_chain_block(bucket, prev, curr) \ + for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \ + (curr) >= 0; \ + (prev) = (curr), (curr) = chain_block_next(curr)) + +/* + * next block or -1 + */ +static inline int chain_block_next(int offset) +{ + int next = chain_hlocks[offset]; + + WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG)); + + if (next == CHAIN_BLK_LIST_END) + return -1; + + next &= ~CHAIN_BLK_FLAG; + next <<= 16; + next |= chain_hlocks[offset + 1]; + + return next; +} + +/* + * bucket-0 only + */ +static inline int chain_block_size(int offset) +{ + return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3]; +} + +static inline void init_chain_block(int offset, int next, int bucket, int size) +{ + chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG; + chain_hlocks[offset + 1] = (u16)next; + + if (size && !bucket) { + chain_hlocks[offset + 2] = size >> 16; + chain_hlocks[offset + 3] = (u16)size; + } +} + +static inline void add_chain_block(int offset, int size) +{ + int bucket = size_to_bucket(size); + int next = chain_block_buckets[bucket]; + int prev, curr; + + if (unlikely(size < 2)) { + /* + * We can't store single entries on the freelist. Leak them. + * + * One possible way out would be to uniquely mark them, other + * than with CHAIN_BLK_FLAG, such that we can recover them when + * the block before it is re-added. + */ + if (size) + nr_lost_chain_hlocks++; + return; + } + + nr_free_chain_hlocks += size; + if (!bucket) { + nr_large_chain_blocks++; + + /* + * Variable sized, sort large to small. + */ + for_each_chain_block(0, prev, curr) { + if (size >= chain_block_size(curr)) + break; + } + init_chain_block(offset, curr, 0, size); + if (prev < 0) + chain_block_buckets[0] = offset; + else + init_chain_block(prev, offset, 0, 0); + return; + } + /* + * Fixed size, add to head. + */ + init_chain_block(offset, next, bucket, size); + chain_block_buckets[bucket] = offset; +} + +/* + * Only the first block in the list can be deleted. + * + * For the variable size bucket[0], the first block (the largest one) is + * returned, broken up and put back into the pool. So if a chain block of + * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be + * queued up after the primordial chain block and never be used until the + * hlock entries in the primordial chain block is almost used up. That + * causes fragmentation and reduce allocation efficiency. That can be + * monitored by looking at the "large chain blocks" number in lockdep_stats. + */ +static inline void del_chain_block(int bucket, int size, int next) +{ + nr_free_chain_hlocks -= size; + chain_block_buckets[bucket] = next; + + if (!bucket) + nr_large_chain_blocks--; +} + +static void init_chain_block_buckets(void) +{ + int i; + + for (i = 0; i < MAX_CHAIN_BUCKETS; i++) + chain_block_buckets[i] = -1; + + add_chain_block(0, ARRAY_SIZE(chain_hlocks)); +} + +/* + * Return offset of a chain block of the right size or -1 if not found. + * + * Fairly simple worst-fit allocator with the addition of a number of size + * specific free lists. + */ +static int alloc_chain_hlocks(int req) +{ + int bucket, curr, size; + + /* + * We rely on the MSB to act as an escape bit to denote freelist + * pointers. Make sure this bit isn't set in 'normal' class_idx usage. + */ + BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG); + + init_data_structures_once(); + + if (nr_free_chain_hlocks < req) + return -1; + + /* + * We require a minimum of 2 (u16) entries to encode a freelist + * 'pointer'. + */ + req = max(req, 2); + bucket = size_to_bucket(req); + curr = chain_block_buckets[bucket]; + + if (bucket) { + if (curr >= 0) { + del_chain_block(bucket, req, chain_block_next(curr)); + return curr; + } + /* Try bucket 0 */ + curr = chain_block_buckets[0]; + } + + /* + * The variable sized freelist is sorted by size; the first entry is + * the largest. Use it if it fits. + */ + if (curr >= 0) { + size = chain_block_size(curr); + if (likely(size >= req)) { + del_chain_block(0, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + } + + /* + * Last resort, split a block in a larger sized bucket. + */ + for (size = MAX_CHAIN_BUCKETS; size > req; size--) { + bucket = size_to_bucket(size); + curr = chain_block_buckets[bucket]; + if (curr < 0) + continue; + + del_chain_block(bucket, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + + return -1; +} + +static inline void free_chain_hlocks(int base, int size) +{ + add_chain_block(base, max(size, 2)); +} struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { @@ -2803,7 +3070,7 @@ static inline int add_chain_cache(struct task_struct *curr, * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (lockdep_assert_locked()) return 0; chain = alloc_lock_chain(); @@ -2824,15 +3091,8 @@ static inline int add_chain_cache(struct task_struct *curr, BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - nr_chain_hlocks += chain->depth; - } else { + j = alloc_chain_hlocks(chain->depth); + if (j < 0) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2841,9 +3101,16 @@ static inline int add_chain_cache(struct task_struct *curr, return 0; } + chain->base = j; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class_idx; + + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); - inc_chains(); + inc_chains(chain->irq_context); return 1; } @@ -2987,6 +3254,8 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void init_chain_block_buckets(void) { } #endif /* CONFIG_PROVE_LOCKING */ /* @@ -3429,9 +3698,9 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; - current->lockdep_recursion = 1; + current->lockdep_recursion++; __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } NOKPROBE_SYMBOL(lockdep_hardirqs_on); @@ -3487,7 +3756,7 @@ void trace_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion = 1; + current->lockdep_recursion++; /* * We'll do an OFF -> ON transition: */ @@ -3502,7 +3771,7 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } /* @@ -3596,7 +3865,8 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return 2 * !!task->hardirq_context + !!task->softirq_context; + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } static int separate_irq_context(struct task_struct *curr, @@ -3682,6 +3952,113 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static int +print_lock_invalid_wait_context(struct task_struct *curr, + struct held_lock *hlock) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("[ BUG: Invalid wait context ]\n"); + print_kernel_ident(); + pr_warn("-----------------------------\n"); + + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + pr_warn("other info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + pr_warn("stack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Verify the wait_type context. + * + * This check validates we takes locks in the right wait-type order; that is it + * ensures that we do not take mutexes inside spinlocks and do not attempt to + * acquire spinlocks inside raw_spinlocks and the sort. + * + * The entire thing is slightly more complex because of RCU, RCU is a lock that + * can be taken from (pretty much) any context but also has constraints. + * However when taken in a stricter environment the RCU lock does not loosen + * the constraints. + * + * Therefore we must look for the strictest environment in the lock stack and + * compare that to the lock we're trying to acquire. + */ +static int check_wait_context(struct task_struct *curr, struct held_lock *next) +{ + short next_inner = hlock_class(next)->wait_type_inner; + short next_outer = hlock_class(next)->wait_type_outer; + short curr_inner; + int depth; + + if (!curr->lockdep_depth || !next_inner || next->trylock) + return 0; + + if (!next_outer) + next_outer = next_inner; + + /* + * Find start of current irq_context.. + */ + for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) { + struct held_lock *prev = curr->held_locks + depth; + if (prev->irq_context != next->irq_context) + break; + } + depth++; + + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + curr_inner = LD_WAIT_CONFIG; + else + curr_inner = LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + curr_inner = LD_WAIT_CONFIG; + } else { + curr_inner = LD_WAIT_MAX; + } + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; + short prev_inner = hlock_class(prev)->wait_type_inner; + + if (prev_inner) { + /* + * We can have a bigger inner than a previous one + * when outer is smaller than inner, as with RCU. + * + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); + } + } + + if (next_outer > curr_inner) + return print_lock_invalid_wait_context(curr, next); + + return 0; +} + #else /* CONFIG_PROVE_LOCKING */ static inline int @@ -3701,13 +4078,20 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } +static inline int check_wait_context(struct task_struct *curr, + struct held_lock *next) +{ + return 0; +} + #endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) +void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, + short inner, short outer) { int i; @@ -3728,6 +4112,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + lock->wait_type_outer = outer; + lock->wait_type_inner = inner; + /* * No key, no joy, we need to hash something. */ @@ -3755,13 +4142,13 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; register_lock_class(lock, subclass, 1); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map); +EXPORT_SYMBOL_GPL(lockdep_init_map_waits); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3862,7 +4249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { + if (depth) { /* we're holding locks */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -3904,6 +4291,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; + if (check_wait_context(curr, hlock)) + return 0; + /* Initialize the lock usage bit */ if (!mark_usage(curr, hlock, check)) return 0; @@ -4139,7 +4529,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map(lock, name, key, 0); + lockdep_init_map_waits(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -4437,11 +4829,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); @@ -4454,15 +4846,45 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_downgrade); +/* NMI context !!! */ +static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class = look_up_lock_class(lock, subclass); + + /* if it doesn't have a class (yet), it certainly hasn't been used yet */ + if (!class) + return; + + if (!(class->usage_mask & LOCK_USED)) + return; + + hlock->class_idx = class - lock_classes; + + print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES); +#endif +} + +static bool lockdep_nmi(void) +{ + if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + return false; + + if (!in_nmi()) + return false; + + return true; +} + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -4473,17 +4895,34 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(current->lockdep_recursion)) { + /* XXX allow trylock from NMI ?!? */ + if (lockdep_nmi() && !trylock) { + struct held_lock hlock; + + hlock.acquire_ip = ip; + hlock.instance = lock; + hlock.nest_lock = nest_lock; + hlock.irq_context = 2; // XXX + hlock.trylock = trylock; + hlock.read = read; + hlock.check = check; + hlock.hardirqs_off = true; + hlock.references = 0; + + verify_lock_unused(lock, &hlock, subclass); + } return; + } raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); @@ -4497,11 +4936,11 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); @@ -4517,9 +4956,9 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; ret = __lock_is_held(lock, read); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return ret; @@ -4538,9 +4977,9 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; cookie = __lock_pin_lock(lock); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return cookie; @@ -4557,9 +4996,9 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_repin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_repin_lock); @@ -4574,9 +5013,9 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_unpin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_unpin_lock); @@ -4712,10 +5151,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_contended(lock, ip); __lock_contended(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); @@ -4732,9 +5171,9 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_acquired(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); @@ -4768,57 +5207,33 @@ static void remove_class_from_lock_chain(struct pending_free *pf, struct lock_class *class) { #ifdef CONFIG_PROVE_LOCKING - struct lock_chain *new_chain; - u64 chain_key; int i; for (i = chain->base; i < chain->base + chain->depth; i++) { if (chain_hlocks[i] != class - lock_classes) continue; - /* The code below leaks one chain_hlock[] entry. */ - if (--chain->depth > 0) { - memmove(&chain_hlocks[i], &chain_hlocks[i + 1], - (chain->base + chain->depth - i) * - sizeof(chain_hlocks[0])); - } /* * Each lock class occurs at most once in a lock chain so once * we found a match we can break out of this loop. */ - goto recalc; + goto free_lock_chain; } /* Since the chain has not been modified, return. */ return; -recalc: - chain_key = INITIAL_CHAIN_KEY; - for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); - if (chain->depth && chain->chain_key == chain_key) - return; +free_lock_chain: + free_chain_hlocks(chain->base, chain->depth); /* Overwrite the chain key for concurrent RCU readers. */ - WRITE_ONCE(chain->chain_key, chain_key); + WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY); + dec_chains(chain->irq_context); + /* * Note: calling hlist_del_rcu() from inside a * hlist_for_each_entry_rcu() loop is safe. */ hlist_del_rcu(&chain->entry); __set_bit(chain - lock_chains, pf->lock_chains_being_freed); - if (chain->depth == 0) - return; - /* - * If the modified lock chain matches an existing lock chain, drop - * the modified lock chain. - */ - if (lookup_chain_cache(chain_key)) - return; - new_chain = alloc_lock_chain(); - if (WARN_ON_ONCE(!new_chain)) { - debug_locks_off(); - return; - } - *new_chain = *chain; - hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); + nr_zapped_lock_chains++; #endif } @@ -4874,6 +5289,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) } remove_class_from_lock_chains(pf, class); + nr_zapped_classes++; } static void reinit_class(struct lock_class *class) @@ -4958,8 +5374,7 @@ static void free_zapped_rcu(struct rcu_head *ch) return; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); /* closed head */ pf = delayed_free.pf + (delayed_free.index ^ 1); @@ -4971,8 +5386,7 @@ static void free_zapped_rcu(struct rcu_head *ch) */ call_rcu_zapped(delayed_free.pf + delayed_free.index); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5017,13 +5431,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); call_rcu_zapped(pf); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); /* @@ -5045,10 +5457,10 @@ static void lockdep_free_key_range_imm(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_free_key_range(pf, start, size); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5144,10 +5556,10 @@ static void lockdep_reset_lock_imm(struct lockdep_map *lock) unsigned long flags; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_reset_lock(pf, lock); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 18d85aebbb57..baca699b94e9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -106,6 +106,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define STACK_TRACE_HASH_SIZE 16384 #endif +/* + * Bit definitions for lock_chain.irq_context + */ +#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0) +#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1) + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -124,17 +130,21 @@ extern const char *__get_key_name(const struct lockdep_subclass_key *key, struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; +extern unsigned long nr_zapped_classes; +extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); -extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; +extern unsigned int nr_free_chain_hlocks; +extern unsigned int nr_lost_chain_hlocks; +extern unsigned int nr_large_chain_blocks; +extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 231684cfc5ae..5525cd3ba0c8 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -128,15 +128,22 @@ static int lc_show(struct seq_file *m, void *v) struct lock_chain *chain = v; struct lock_class *class; int i; + static const char * const irq_strs[] = { + [0] = "0", + [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT| + LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq", + }; if (v == SEQ_START_TOKEN) { - if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + if (!nr_free_chain_hlocks) seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } - seq_printf(m, "irq_context: %d\n", chain->irq_context); + seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]); for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); @@ -271,8 +278,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", lock_chain_count(), MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n", + MAX_LOCKDEP_CHAIN_HLOCKS - + (nr_free_chain_hlocks + nr_lost_chain_hlocks), + MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks lost: %11u\n", + nr_lost_chain_hlocks); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -336,6 +347,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " debug_locks: %11u\n", debug_locks); + /* + * Zappped classes and lockdep data buffers reuse statistics. + */ + seq_puts(m, "\n"); + seq_printf(m, " zapped classes: %11lu\n", + nr_zapped_classes); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " zapped lock chains: %11lu\n", + nr_zapped_lock_chains); + seq_printf(m, " large chain blocks: %11u\n", + nr_large_chain_blocks); +#endif return 0; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 99475a66c94f..5efbfc68ce99 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { static int lock_torture_writer(void *arg) { struct lock_stress_stats *lwsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); set_user_nice(current, MAX_NICE); @@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; - static DEFINE_TORTURE_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -696,15 +696,16 @@ static void __torture_print_stats(char *page, if (statp[i].n_lock_fail) fail = true; sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_fail) - max = statp[i].n_lock_fail; - if (min > statp[i].n_lock_fail) - min = statp[i].n_lock_fail; + if (max < statp[i].n_lock_acquired) + max = statp[i].n_lock_acquired; + if (min > statp[i].n_lock_acquired) + min = statp[i].n_lock_acquired; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", write ? "Writes" : "Reads ", - sum, max, min, max / 2 > min ? "???" : "", + sum, max, min, + !onoff_interval && max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) atomic_inc(&cxt.n_lock_torture_errors); diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 771d4ca96dda..a7276aaf2abc 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -85,7 +85,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif lock->magic = lock; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 364d38a0c444..a008a1ba21a7 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,27 +1,29 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> -#include <linux/rwsem.h> #include <linux/percpu.h> +#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/errno.h> -#include "rwsem.h" - int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, - const char *name, struct lock_class_key *rwsem_key) + const char *name, struct lock_class_key *key) { sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss); - __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); - sem->readers_block = 0; + init_waitqueue_head(&sem->waiters); + atomic_set(&sem->block, 0); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); @@ -41,73 +43,139 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { + __this_cpu_inc(*sem->read_count); + /* * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * - * If the reader misses the writer's assignment of readers_block, then - * the writer is guaranteed to see the reader's increment. + * If the reader misses the writer's assignment of sem->block, then the + * writer is guaranteed to see the reader's increment. * * Conversely, any readers that increment their sem->read_count after - * the writer looks are guaranteed to see the readers_block value, - * which in turn means that they are guaranteed to immediately - * decrement their sem->read_count, so that it doesn't matter that the - * writer missed them. + * the writer looks are guaranteed to see the sem->block value, which + * in turn means that they are guaranteed to immediately decrement + * their sem->read_count, so that it doesn't matter that the writer + * missed them. */ smp_mb(); /* A matches D */ /* - * If !readers_block the critical section starts here, matched by the + * If !sem->block the critical section starts here, matched by the * release in percpu_up_write(). */ - if (likely(!smp_load_acquire(&sem->readers_block))) + if (likely(!atomic_read_acquire(&sem->block))) + return true; + + __this_cpu_dec(*sem->read_count); + + /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + + return false; +} + +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) +{ + if (atomic_read(&sem->block)) + return false; + + return atomic_xchg(&sem->block, 1) == 0; +} + +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) +{ + if (reader) { + bool ret; + + preempt_disable(); + ret = __percpu_down_read_trylock(sem); + preempt_enable(); + + return ret; + } + return __percpu_down_write_trylock(sem); +} + +/* + * The return value of wait_queue_entry::func means: + * + * <0 - error, wakeup is terminated and the error is returned + * 0 - no wakeup, a next waiter is tried + * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. + * + * We use EXCLUSIVE for both readers and writers to preserve FIFO order, + * and play games with the return value to allow waking multiple readers. + * + * Specifically, we wake readers until we've woken a single writer, or until a + * trylock fails. + */ +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int wake_flags, + void *key) +{ + struct task_struct *p = get_task_struct(wq_entry->private); + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; + struct percpu_rw_semaphore *sem = key; + + /* concurrent against percpu_down_write(), can get stolen */ + if (!__percpu_rwsem_trylock(sem, reader)) return 1; - /* - * Per the above comment; we still have preemption disabled and - * will thus decrement on the same CPU as we incremented. - */ - __percpu_up_read(sem); + list_del_init(&wq_entry->entry); + smp_store_release(&wq_entry->private, NULL); - if (try) - return 0; + wake_up_process(p); + put_task_struct(p); - /* - * We either call schedule() in the wait, or we'll fall through - * and reschedule on the preempt_enable() in percpu_down_read(). - */ - preempt_enable_no_resched(); + return !reader; /* wake (readers until) 1 writer */ +} + +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +{ + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); + bool wait; + spin_lock_irq(&sem->waiters.lock); /* - * Avoid lockdep for the down/up_read() we already have them. + * Serialize against the wakeup in percpu_up_write(), if we fail + * the trylock, the wakeup must see us on the list. */ - __down_read(&sem->rw_sem); - this_cpu_inc(*sem->read_count); - __up_read(&sem->rw_sem); + wait = !__percpu_rwsem_trylock(sem, reader); + if (wait) { + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); + } + spin_unlock_irq(&sem->waiters.lock); - preempt_disable(); - return 1; + while (wait) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!smp_load_acquire(&wq_entry.private)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL_GPL(__percpu_down_read); -void __percpu_up_read(struct percpu_rw_semaphore *sem) +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to aggregate - * zero, as that is the only time it matters) they will also see our - * critical section. - */ - __this_cpu_dec(*sem->read_count); + if (__percpu_down_read_trylock(sem)) + return true; - /* Prod writer to recheck readers_active */ - rcuwait_wake_up(&sem->writer); + if (try) + return false; + + preempt_enable(); + percpu_rwsem_wait(sem, /* .reader = */ true); + preempt_disable(); + + return true; } -EXPORT_SYMBOL_GPL(__percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ @@ -124,6 +192,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. + * + * Assumes sem->block is set. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { @@ -142,32 +212,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) void percpu_down_write(struct percpu_rw_semaphore *sem) { + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); - down_write(&sem->rw_sem); - /* - * Notify new readers to block; up until now, and thus throughout the - * longish rcu_sync_enter() above, new readers could still come in. + * Try set sem->block; this provides writer-writer exclusion. + * Having sem->block set makes new readers block. */ - WRITE_ONCE(sem->readers_block, 1); + if (!__percpu_down_write_trylock(sem)) + percpu_rwsem_wait(sem, /* .reader = */ false); - smp_mb(); /* D matches A */ + /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ /* - * If they don't see our writer of readers_block, then we are - * guaranteed to see their sem->read_count increment, and therefore - * will wait for them. + * If they don't see our store of sem->block, then we are guaranteed to + * see their sem->read_count increment, and therefore will wait for + * them. */ - /* Wait for all now active readers to complete. */ - rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *sem) { + rwsem_release(&sem->dep_map, _RET_IP_); + /* * Signal the writer is done, no fast path yet. * @@ -178,12 +252,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ - smp_store_release(&sem->readers_block, 0); + atomic_set_release(&sem->block, 0); /* - * Release the write lock, this will allow readers back in the game. + * Prod any pending reader/writer to make progress. */ - up_write(&sem->rw_sem); + __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); /* * Once this completes (at least one RCU-sched grace period hence) the diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 851bbb10819d..c9f090d64f00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; - lock->owner = (struct task_struct *)val; + WRITE_ONCE(lock->owner, (struct task_struct *)val); } static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 0d9b6be9ecc8..f11b9bd3431d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,7 +28,6 @@ #include <linux/rwsem.h> #include <linux/atomic.h> -#include "rwsem.h" #include "lock_events.h" /* @@ -329,7 +328,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif #ifdef CONFIG_DEBUG_RWSEMS sem->magic = sem; @@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long flags; bool ret = true; - BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); - if (need_resched()) { lockevent_inc(rwsem_opt_fail); return false; @@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); @@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_semaphore *sem) { long tmp; diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 2534ce49f648..e69de29bb2d1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __INTERNAL_RWSEM_H -#define __INTERNAL_RWSEM_H -#include <linux/rwsem.h> - -extern void __down_read(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); - -#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 472dd462a40c..b9d93087ee66 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -14,14 +14,14 @@ #include <linux/export.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, short inner) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner); #endif lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; @@ -39,7 +39,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); #endif lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,16 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 83edf8698118..db0bed2cae26 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,31 +1,21 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. It provides infrastructure for registration of: + * Power Management Quality of Service (PM QoS) support base. * - * Dependents on a QoS value : register requests - * Watchers of QoS value : get notified when target QoS value changes + * Copyright (C) 2020 Intel Corporation * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. + * Authors: + * Mark Gross <mgross@linux.intel.com> + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) + * Provided here is an interface for specifying PM QoS dependencies. It allows + * entities depending on QoS constraints to register their requests which are + * aggregated as appropriate to produce effective constraints (target values) + * that can be monitored by entities needing to respect them, either by polling + * or through a built-in notification mechanism. * - * There are lists of pm_qos_objects each one wrapping requests, notifiers - * - * User mode requests on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * request is removed and a new qos target is computed. This way when the - * request that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. - * - * Mark Gross <mgross@linux.intel.com> + * In addition to the basic functionality, more specific interfaces for managing + * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ @@ -54,56 +44,19 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_object { - struct pm_qos_constraints *constraints; - struct miscdevice pm_qos_power_miscdev; - char *name; -}; - static DEFINE_SPINLOCK(pm_qos_lock); -static struct pm_qos_object null_pm_qos; - -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_constraints cpu_dma_constraints = { - .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), - .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &cpu_dma_lat_notifier, -}; -static struct pm_qos_object cpu_dma_pm_qos = { - .constraints = &cpu_dma_constraints, - .name = "cpu_dma_latency", -}; - -static struct pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, -}; - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .read = pm_qos_power_read, - .open = pm_qos_power_open, - .release = pm_qos_power_release, - .llseek = noop_llseek, -}; - -/* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_constraints *c) +/** + * pm_qos_read_value - Return the current effective constraint value. + * @c: List of PM QoS constraint requests. + */ +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - struct plist_node *node; - int total_value = 0; + return READ_ONCE(c->target_value); +} +static int pm_qos_get_value(struct pm_qos_constraints *c) +{ if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -114,111 +67,42 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; - case PM_QOS_SUM: - plist_for_each(node, &c->list) - total_value += node->prio; - - return total_value; - default: - /* runtime check for not using enum */ - BUG(); + WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } -s32 pm_qos_read_value(struct pm_qos_constraints *c) -{ - return c->target_value; -} - -static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) +static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - c->target_value = value; + WRITE_ONCE(c->target_value, value); } -static int pm_qos_debug_show(struct seq_file *s, void *unused) -{ - struct pm_qos_object *qos = (struct pm_qos_object *)s->private; - struct pm_qos_constraints *c; - struct pm_qos_request *req; - char *type; - unsigned long flags; - int tot_reqs = 0; - int active_reqs = 0; - - if (IS_ERR_OR_NULL(qos)) { - pr_err("%s: bad qos param!\n", __func__); - return -EINVAL; - } - c = qos->constraints; - if (IS_ERR_OR_NULL(c)) { - pr_err("%s: Bad constraints on qos?\n", __func__); - return -EINVAL; - } - - /* Lock to ensure we have a snapshot */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (plist_head_empty(&c->list)) { - seq_puts(s, "Empty!\n"); - goto out; - } - - switch (c->type) { - case PM_QOS_MIN: - type = "Minimum"; - break; - case PM_QOS_MAX: - type = "Maximum"; - break; - case PM_QOS_SUM: - type = "Sum"; - break; - default: - type = "Unknown"; - } - - plist_for_each_entry(req, &c->list, node) { - char *state = "Default"; - - if ((req->node).prio != c->default_value) { - active_reqs++; - state = "Active"; - } - tot_reqs++; - seq_printf(s, "%d: %d: %s\n", tot_reqs, - (req->node).prio, state); - } - - seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", - type, pm_qos_get_value(c), active_reqs, tot_reqs); - -out: - spin_unlock_irqrestore(&pm_qos_lock, flags); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(pm_qos_debug); - /** - * pm_qos_update_target - manages the constraints list and calls the notifiers - * if needed - * @c: constraints data struct - * @node: request to add to the list, to update or to remove - * @action: action to take on the constraints list - * @value: value of the request to add or update + * pm_qos_update_target - Update a list of PM QoS constraint requests. + * @c: List of PM QoS requests. + * @node: Target list entry. + * @action: Action to carry out (add, update or remove). + * @value: New request value for the target list entry. * - * This function returns 1 if the aggregated constraint value has changed, 0 - * otherwise. + * Update the given list of PM QoS constraint requests, @c, by carrying an + * @action involving the @node list entry and @value on it. + * + * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node + * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store + * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove + * @node from the list, ignore @value). + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { - unsigned long flags; int prev_value, curr_value, new_value; - int ret; + unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; @@ -231,9 +115,8 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, break; case PM_QOS_UPDATE_REQ: /* - * to change the list, we atomically remove, reinit - * with new value and add, then see if the extremal - * changed + * To change the list, atomically remove, reinit with new value + * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); /* fall through */ @@ -252,16 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); - if (prev_value != curr_value) { - ret = 1; - if (c->notifiers) - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - } else { - ret = 0; - } - return ret; + + if (prev_value == curr_value) + return 0; + + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, curr_value, NULL); + + return 1; } /** @@ -283,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, /** * pm_qos_update_flags - Update a set of PM QoS flags. - * @pqf: Set of flags to update. + * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * - * Update the given set of PM QoS flags and call notifiers if the aggregate - * value has changed. Returns 1 if the aggregate constraint value has changed, - * 0 otherwise. + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, @@ -326,288 +205,180 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); - return prev_value != curr_value; -} -/** - * pm_qos_request - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value. - */ -int pm_qos_request(int pm_qos_class) -{ - return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); -} -EXPORT_SYMBOL_GPL(pm_qos_request); - -int pm_qos_request_active(struct pm_qos_request *req) -{ - return req->pm_qos_class != 0; + return prev_value != curr_value; } -EXPORT_SYMBOL_GPL(pm_qos_request_active); -static void __pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) -{ - trace_pm_qos_update_request(req->pm_qos_class, new_value); +#ifdef CONFIG_CPU_IDLE +/* Definitions related to the CPU latency QoS. */ - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); -} +static struct pm_qos_constraints cpu_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), + .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; /** - * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout - * @work: work struct for the delayed work (timeout) - * - * This cancels the timeout request by falling back to the default at timeout. + * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ -static void pm_qos_work_fn(struct work_struct *work) +s32 cpu_latency_qos_limit(void) { - struct pm_qos_request *req = container_of(to_delayed_work(work), - struct pm_qos_request, - work); - - __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); + return pm_qos_read_value(&cpu_latency_constraints); } /** - * pm_qos_add_request - inserts new qos request into the list - * @req: pointer to a preallocated handle - * @pm_qos_class: identifies which list of qos request to use - * @value: defines the qos request + * cpu_latency_qos_request_active - Check the given PM QoS request. + * @req: PM QoS request to check. * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request - * handle. Caller needs to save this handle for later use in updates and - * removal. + * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' + * otherwise. */ - -void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) +bool cpu_latency_qos_request_active(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ - return; + return req->qos == &cpu_latency_constraints; +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); - if (pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); - return; - } - req->pm_qos_class = pm_qos_class; - INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); - trace_pm_qos_add_request(pm_qos_class, value); - pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); +static void cpu_latency_qos_apply(struct pm_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret = pm_qos_update_target(req->qos, &req->node, action, value); + if (ret > 0) + wake_up_all_idle_cpus(); } -EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_request - modifies an existing qos request - * @req : handle to list element holding a pm_qos request to use - * @value: defines the qos request + * cpu_latency_qos_add_request - Add new CPU latency QoS request. + * @req: Pointer to a preallocated handle. + * @value: Requested constraint value. * - * Updates an existing qos request for the pm_qos_class of parameters along - * with updating the target pm_qos_class value. + * Use @value to initialize the request handle pointed to by @req, insert it as + * a new entry to the CPU latency QoS list and recompute the effective QoS + * constraint for that list. * - * Attempts are made to make this code callable on hot code paths. + * Callers need to save the handle for later use in updates and removal of the + * QoS request represented by it. */ -void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) +void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + if (cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } - cancel_delayed_work_sync(&req->work); - __pm_qos_update_request(req, new_value); + trace_pm_qos_add_request(value); + + req->qos = &cpu_latency_constraints; + cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } -EXPORT_SYMBOL_GPL(pm_qos_update_request); +EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** - * pm_qos_update_request_timeout - modifies an existing qos request temporarily. - * @req : handle to list element holding a pm_qos request to use - * @new_value: defines the temporal qos request - * @timeout_us: the effective duration of this qos request in usecs. + * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. + * @req : QoS request to update. + * @new_value: New requested constraint value. * - * After timeout_us, this qos request is cancelled automatically. + * Use @new_value to update the QoS request represented by @req in the CPU + * latency QoS list along with updating the effective constraint value for that + * list. */ -void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, - unsigned long timeout_us) +void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req) return; - if (WARN(!pm_qos_request_active(req), - "%s called for unknown object.", __func__)) + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; + } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request(new_value); - trace_pm_qos_update_request_timeout(req->pm_qos_class, - new_value, timeout_us); - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + if (new_value == req->node.prio) + return; - schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); + cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } +EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** - * pm_qos_remove_request - modifies an existing qos request - * @req: handle to request list element + * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. + * @req: QoS request to remove. * - * Will remove pm qos request from the list of constraints and - * recompute the current target value for the pm_qos_class. Call this - * on slow code paths. + * Remove the CPU latency QoS request represented by @req from the CPU latency + * QoS list along with updating the effective constraint value for that list. */ -void pm_qos_remove_request(struct pm_qos_request *req) +void cpu_latency_qos_remove_request(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); - trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); - pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, - PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_request); - -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); - -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; +EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); +/* User space interface to the CPU latency QoS via misc device. */ - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); - -/* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) +static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, - &pm_qos_debug_fops); + struct pm_qos_request *req; - return misc_register(&qos->pm_qos_power_miscdev); -} + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; + cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; - for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; - } - return -1; + return 0; } -static int pm_qos_power_open(struct inode *inode, struct file *filp) +static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { - long pm_qos_class; - - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { - struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return -ENOMEM; - - pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); - filp->private_data = req; - - return 0; - } - return -EPERM; -} + struct pm_qos_request *req = filp->private_data; -static int pm_qos_power_release(struct inode *inode, struct file *filp) -{ - struct pm_qos_request *req; + filp->private_data = NULL; - req = filp->private_data; - pm_qos_remove_request(req); + cpu_latency_qos_remove_request(req); kfree(req); return 0; } - -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) { - s32 value; - unsigned long flags; struct pm_qos_request *req = filp->private_data; + unsigned long flags; + s32 value; - if (!req) - return -EINVAL; - if (!pm_qos_request_active(req)) + if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); + value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -620,36 +391,38 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return ret; } - req = filp->private_data; - pm_qos_update_request(req, value); + cpu_latency_qos_update_request(filp->private_data, value); return count; } +static const struct file_operations cpu_latency_qos_fops = { + .write = cpu_latency_qos_write, + .read = cpu_latency_qos_read, + .open = cpu_latency_qos_open, + .release = cpu_latency_qos_release, + .llseek = noop_llseek, +}; -static int __init pm_qos_power_init(void) -{ - int ret = 0; - int i; - struct dentry *d; - - BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); +static struct miscdevice cpu_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_dma_latency", + .fops = &cpu_latency_qos_fops, +}; - d = debugfs_create_dir("pm_qos", NULL); +static int __init cpu_latency_qos_init(void) +{ + int ret; - for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i], d); - if (ret < 0) { - pr_err("%s: %s setup failed\n", - __func__, pm_qos_array[i]->name); - return ret; - } - } + ret = misc_register(&cpu_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_latency_qos_miscdev.name); return ret; } - -late_initcall(pm_qos_power_init); +late_initcall(cpu_latency_qos_init); +#endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 77438954cc2b..58ed9478787f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -409,21 +409,7 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_ALLOC_SWAP_PAGE: { - compat_loff_t __user *uoffset = compat_ptr(arg); - loff_t offset; - mm_segment_t old_fs; - int err; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, cmd, (unsigned long) &offset); - set_fs(old_fs); - if (!err && put_user(offset, uoffset)) - err = -EFAULT; - return err; - } - + case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 82d5fba48b2f..f91f2c2cf138 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,6 +3,10 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 05f936ed167a..00ddc92c5774 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} + #define rcu_ftrace_dump_stall_suppress() \ do { \ if (!rcu_cpu_stall_suppress) \ @@ -218,6 +230,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ @@ -325,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -335,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5f4fd3b8777c..9a0f66133b4b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* @@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); - rclp->head = NULL; - rclp->tail = &rclp->head; } /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index da94b89cd531..a4a8d097d84d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/err.h> @@ -611,6 +612,7 @@ kfree_perf_thread(void *arg) long me = (long)arg; struct kfree_obj *alloc_ptr; u64 start_time, end_time; + long long mem_begin, mem_during = 0; VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); @@ -626,6 +628,12 @@ kfree_perf_thread(void *arg) } do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + for (i = 0; i < kfree_alloc_num; i++) { alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) @@ -645,9 +653,11 @@ kfree_perf_thread(void *arg) else b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + if (shutdown) { smp_mb(); /* Assign before wake. */ wake_up(&shutdown_wq); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1aeecc165b21..5453bd557f43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!READ_ONCE(rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - i = rp->rtort_pipe_count; + i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1067,7 +1069,8 @@ rcu_torture_writer(void *arg) if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && - !torture_must_stop()) + !torture_must_stop() && + rcu_inkernel_boot_has_ended()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != @@ -1290,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) atomic_inc(&n_rcu_torture_mberror); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; @@ -1404,14 +1407,15 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { @@ -1420,9 +1424,10 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); + rtcp = rcu_access_pointer(rcu_torture_current); pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current ? "ver" : "VER", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1478,7 +1483,8 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; @@ -1993,8 +1999,11 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - rcu_torture_fwd_prog_cr(rfp); + if (!IS_ENABLED(CONFIG_TINY_RCU) || + rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + if (rcu_inkernel_boot_has_ended()) + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2044,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) atomic_inc(&barrier_cbs_invoked); } +/* IPI handler to get callback posted on desired CPU, if online. */ +static void rcu_torture_barrier1cb(void *rcu_void) +{ + struct rcu_head *rhp = rcu_void; + + cur_ops->call(rhp, rcu_torture_barrier_cbf); +} + /* kthread function to register callbacks used to test RCU barriers. */ static int rcu_torture_barrier_cbs(void *arg) { @@ -2067,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - local_irq_disable(); /* Just to test no-irq call_rcu(). */ - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - local_irq_enable(); + if (smp_call_function_single(myid, rcu_torture_barrier1cb, + &rcu, 1)) { + // IPI failed, so use direct call from current CPU. + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + } if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -2105,7 +2124,21 @@ static int rcu_torture_barrier(void *arg) pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", atomic_read(&barrier_cbs_invoked), n_barrier_cbs); - WARN_ON_ONCE(1); + WARN_ON(1); + // Wait manually for the remaining callbacks + i = 0; + do { + if (WARN_ON(i++ > HZ)) + i = INT_MIN; + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + } while (atomic_read(&barrier_cbs_invoked) != + n_barrier_cbs && + !torture_must_stop()); + smp_mb(); // Can't trust ordering if broken. + if (!torture_must_stop()) + pr_err("Recovered: barrier_cbs_invoked = %d\n", + atomic_read(&barrier_cbs_invoked)); } else { n_barrier_successes++; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 657e6a7d1c03..0c71505f0e19 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@linux.ibm.com> + * Authors: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + state = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); spin_unlock_irq_rcu_node(ssp); mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); @@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp } spin_lock_irqsave_rcu_node(ssp, flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); } @@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } @@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && @@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_idx); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); + spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + spin_unlock_irq_rcu_node(ssp); } if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..06548e2ebb72 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. @@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -342,14 +343,17 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; + int new_old; struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + new_old = atomic_read(&rdp->dynticks); do { - old = atomic_read(&rdp->dynticks); + old = new_old; if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); + new_old = atomic_cmpxchg(&rdp->dynticks, old, new); + } while (new_old != old); return true; } @@ -410,10 +414,15 @@ static long blimit = DEFAULT_RCU_BLIMIT; static long qhimark = DEFAULT_RCU_QHIMARK; #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ static long qlowmark = DEFAULT_RCU_QLOMARK; +#define DEFAULT_RCU_QOVLD_MULT 2 +#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) +static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ +static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); +module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; @@ -818,11 +827,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && + !READ_ONCE(rdp->rcu_forced_tick)) { raw_spin_lock_rcu_node(rdp->mynode); // Recheck under lock. if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; + WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); @@ -899,7 +909,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - rdp->rcu_forced_tick = false; + WRITE_ONCE(rdp->rcu_forced_tick, false); } } @@ -1072,7 +1082,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || - time_after(jiffies, rcu_state.jiffies_resched))) { + time_after(jiffies, rcu_state.jiffies_resched) || + rcu_state.cbovld)) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); @@ -1089,8 +1100,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * So hit them over the head with the resched_cpu() hammer! */ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); @@ -1113,6 +1124,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -1126,8 +1138,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, - rnp->level, rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + gp_seq_req, rnp->level, + rnp->grplo, rnp->grphi, s); } /* @@ -1174,7 +1187,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, TPS("Prestarted")); goto unlock_out; } - rnp->gp_seq_needed = gp_seq_req; + WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf or internal node, and a @@ -1199,18 +1212,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; - if (!rcu_state.gp_kthread) { + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + if (!READ_ONCE(rcu_state.gp_kthread)) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { - rnp_start->gp_seq_needed = rnp->gp_seq_needed; - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); } if (rnp != rnp_start) raw_spin_unlock_rcu_node(rnp); @@ -1235,12 +1248,13 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken (unless in - * an interrupt or softirq handler), and don't bother awakening when there - * is nothing for the grace-period kthread to do (as in several CPUs raced - * to awaken, and we lost), and finally don't try to awaken a kthread that - * has not yet been created. If all those checks are passed, track some - * debug information and awaken. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately + * sleep upon return, resulting in a grace-period hang), and don't bother + * awakening when there is nothing for the grace-period kthread to do + * (as in several CPUs raced to awaken, we lost), and finally don't try + * to awaken a kthread that has not yet been created. If all those checks + * are passed, track some debug information and awaken. * * So why do the self-wakeup when in an interrupt or softirq handler * in the grace-period kthread's context? Because the kthread might have @@ -1250,10 +1264,10 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) */ static void rcu_gp_kthread_wake(void) { - if ((current == rcu_state.gp_kthread && - !in_irq() && !in_serving_softirq()) || - !READ_ONCE(rcu_state.gp_flags) || - !rcu_state.gp_kthread) + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); + + if ((current == t && !in_irq() && !in_serving_softirq()) || + !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); @@ -1321,7 +1335,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; @@ -1386,7 +1400,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; - bool need_gp; + bool need_qs; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -1400,10 +1414,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) unlikely(READ_ONCE(rdp->gpwrap))) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ + rdp->core_needs_qs = false; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { if (!offloaded) ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ + if (rdp->core_needs_qs) + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1415,14 +1432,14 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) * go looking for one. */ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); - need_gp = !!(rnp->qsmask & rdp->grpmask); - rdp->cpu_no_qs.b.norm = need_gp; - rdp->core_needs_qs = need_gp; + need_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_qs; + rdp->core_needs_qs = need_qs; zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1651,8 +1668,7 @@ static void rcu_gp_fqs_loop(void) WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( @@ -1666,13 +1682,11 @@ static void rcu_gp_fqs_loop(void) /* If time for quiescent-state forcing, do it. */ if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); first_gp_fqs = false; - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); @@ -1683,8 +1697,7 @@ static void rcu_gp_fqs_loop(void) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -1701,8 +1714,9 @@ static void rcu_gp_fqs_loop(void) */ static void rcu_gp_cleanup(void) { - unsigned long gp_duration; + int cpu; bool needgp = false; + unsigned long gp_duration; unsigned long new_gp_seq; bool offloaded; struct rcu_data *rdp; @@ -1748,6 +1762,12 @@ static void rcu_gp_cleanup(void) needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ needgp = rcu_future_gp_cleanup(rnp) || needgp; + // Reset overload indication for CPUs no longer overloaded + if (rcu_is_leaf_node(rnp)) + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + check_cb_ovld_locked(rdp, rnp); + } sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -1774,9 +1794,9 @@ static void rcu_gp_cleanup(void) rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + rcu_state.gp_seq, TPS("newreq")); } else { WRITE_ONCE(rcu_state.gp_flags, @@ -1795,8 +1815,7 @@ static int __noreturn rcu_gp_kthread(void *unused) /* Handle grace-period start. */ for (;;) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); rcu_state.gp_state = RCU_GP_WAIT_GPS; swait_event_idle_exclusive(rcu_state.gp_wq, @@ -1809,8 +1828,7 @@ static int __noreturn rcu_gp_kthread(void *unused) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwaitsig")); } @@ -1881,7 +1899,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); - rnp->qsmask &= ~mask; + WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, @@ -1904,7 +1922,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); - oldmask = rnp_c->qsmask; + oldmask = READ_ONCE(rnp_c->qsmask); } /* @@ -1987,6 +2005,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + if (rdp->cpu == smp_processor_id()) + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2052,7 +2072,7 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; blkd = !!(rnp->qsmask & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; } @@ -2294,10 +2314,13 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) struct rcu_data *rdp; struct rcu_node *rnp; + rcu_state.cbovld = rcu_state.cbovldnext; + rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { @@ -2579,11 +2602,48 @@ static void rcu_leak_callback(struct rcu_head *rhp) } /* - * Helper function for call_rcu() and friends. The cpu argument will - * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() - * is expected to specify a CPU. + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. The caller must hold the leaf rcu_node + * structure's ->lock. */ +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (qovld_calc <= 0) + return; // Early boot and wildcard value set. + if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); + else + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); +} + +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. No locks need be held, but the + * caller must have disabled interrupts. + * + * Note that this function ignores the possibility that there are a lot + * of callbacks all of which have already seen the end of their respective + * grace periods. This omission is due to the need for no-CBs CPUs to + * be holding ->nocb_lock to do this check, which is too heavy for a + * common-case operation. + */ +static void check_cb_ovld(struct rcu_data *rdp) +{ + struct rcu_node *const rnp = rdp->mynode; + + if (qovld_calc <= 0 || + ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == + !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) + return; // Early boot wildcard value or already set correctly. + raw_spin_lock_rcu_node(rnp); + check_cb_ovld_locked(rdp, rnp); + raw_spin_unlock_rcu_node(rnp); +} + +/* Helper function for call_rcu() and friends. */ static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2621,9 +2681,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_init(&rdp->cblist); } + check_cb_ovld(rdp); if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. - /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ + // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, @@ -2689,22 +2750,47 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) + +/** + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * @nr_records: Number of active pointers in the array + * @records: Array of the kfree_rcu() pointers + * @next: Next bulk object in the block chain + * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set + */ +struct kfree_rcu_bulk_data { + unsigned long nr_records; + void *records[KFREE_BULK_MAX_ENTR]; + struct kfree_rcu_bulk_data *next; + struct rcu_head *head_free_debug; +}; + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct kfree_rcu_bulk_data *bhead_free; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -2718,6 +2804,8 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; + struct kfree_rcu_bulk_data *bhead; + struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; @@ -2727,14 +2815,24 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static __always_inline void +debug_rcu_head_unqueue_bulk(struct rcu_head *head) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + for (; head; head = head->next) + debug_rcu_head_unqueue(head); +#endif +} + /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->head_free. + * It frees all the objects queued on ->bhead_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; struct rcu_head *head, *next; + struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; @@ -2744,22 +2842,44 @@ static void kfree_rcu_work(struct work_struct *work) spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; + bhead = krwp->bhead_free; + krwp->bhead_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); - // List "head" is now private, so traverse locklessly. + /* "bhead" is now private, so traverse locklessly. */ + for (; bhead; bhead = bnext) { + bnext = bhead->next; + + debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, + bhead->nr_records, bhead->records); + + kfree_bulk(bhead->nr_records, bhead->records); + rcu_lock_release(&rcu_callback_map); + + if (cmpxchg(&krcp->bcached, NULL, bhead)) + free_page((unsigned long) bhead); + + cond_resched_tasks_rcu_qs(); + } + + /* + * Emergency case only. It can happen under low memory + * condition when an allocation gets failed, so the "bulk" + * path can not be temporary maintained. + */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; next = head->next; - // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { - /* Could be optimized with kfree_bulk() in future. */ + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) kfree((void *)head - offset); - } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2774,26 +2894,48 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + struct kfree_rcu_cpu_work *krwp; + bool queued = false; int i; - struct kfree_rcu_cpu_work *krwp = NULL; lockdep_assert_held(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) - if (!krcp->krw_arr[i].head_free) { - krwp = &(krcp->krw_arr[i]); - break; - } - // If a previous RCU batch is in progress, we cannot immediately - // queue another one, so return false to tell caller to retry. - if (!krwp) - return false; + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); - krwp->head_free = krcp->head; - krcp->head = NULL; - INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krwp->rcu_work); - return true; + /* + * Try to detach bhead or head and attach it over any + * available corresponding free channel. It can be that + * a previous RCU batch is in progress, it means that + * immediately to queue another one is not possible so + * return false to tell caller to retry. + */ + if ((krcp->bhead && !krwp->bhead_free) || + (krcp->head && !krwp->head_free)) { + /* Channel 1. */ + if (!krwp->bhead_free) { + krwp->bhead_free = krcp->bhead; + krcp->bhead = NULL; + } + + /* Channel 2. */ + if (!krwp->head_free) { + krwp->head_free = krcp->head; + krcp->head = NULL; + } + + /* + * One work is per one batch, so there are two "free channels", + * "bhead_free" and "head_free" the batch can handle. It can be + * that the work is in the pending state when two channels have + * been detached following each other, one by one. + */ + queue_rcu_work(system_wq, &krwp->rcu_work); + queued = true; + } + } + + return queued; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, @@ -2830,19 +2972,65 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } +static inline bool +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, + struct rcu_head *head, rcu_callback_t func) +{ + struct kfree_rcu_bulk_data *bnode; + + if (unlikely(!krcp->initialized)) + return false; + + lockdep_assert_held(&krcp->lock); + + /* Check if a new block is required. */ + if (!krcp->bhead || + krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + bnode = xchg(&krcp->bcached, NULL); + if (!bnode) { + WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + } + + /* Switch to emergency path. */ + if (unlikely(!bnode)) + return false; + + /* Initialize the new block. */ + bnode->nr_records = 0; + bnode->next = krcp->bhead; + bnode->head_free_debug = NULL; + + /* Attach it to the head. */ + krcp->bhead = bnode; + } + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + head->func = func; + head->next = krcp->bhead->head_free_debug; + krcp->bhead->head_free_debug = head; +#endif + + /* Finally insert. */ + krcp->bhead->records[krcp->bhead->nr_records++] = + (void *) head - (unsigned long) func; + + return true; +} + /* - * Queue a request for lazy invocation of kfree() after a grace period. + * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace + * period. Please note there are two paths are maintained, one is the main one + * that uses kfree_bulk() interface and second one is emergency one, that is + * used only when the main path can not be maintained temporary, due to memory + * pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch - * will be kfree'd in workqueue context. This allows us to: - * - * 1. Batch requests together to reduce the number of grace periods during - * heavy kfree_rcu() load. - * - * 2. It makes it possible to use kfree_bulk() on a large number of - * kfree_rcu() requests thus reducing cache misses and the per-object - * overhead of kfree(). + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -2861,9 +3049,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); goto unlock_return; } - head->func = func; - head->next = krcp->head; - krcp->head = head; + + /* + * Under high memory pressure GFP_NOWAIT can fail, + * in that case the emergency path is maintained. + */ + if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + head->func = func; + head->next = krcp->head; + krcp->head = head; + } // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && @@ -3075,24 +3270,32 @@ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) /* * RCU callback function for rcu_barrier(). If we are last, wake * up the task executing rcu_barrier(). + * + * Note that the value of rcu_state.barrier_sequence must be captured + * before the atomic_dec_and_test(). Otherwise, if this CPU is not last, + * other CPUs might count the value down to zero before this CPU gets + * around to invoking rcu_barrier_trace(), which might result in bogus + * data from the next instance of rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { + unsigned long __maybe_unused s = rcu_state.barrier_sequence; + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { - rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); } else { - rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, s); } } /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *unused) +static void rcu_barrier_func(void *cpu_in) { - struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; @@ -3119,7 +3322,7 @@ static void rcu_barrier_func(void *unused) */ void rcu_barrier(void) { - int cpu; + uintptr_t cpu; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -3142,13 +3345,14 @@ void rcu_barrier(void) rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* - * Initialize the count to one rather than to zero in order to - * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Exclude CPU-hotplug operations - * to ensure that no offline CPU has callbacks queued. + * Initialize the count to two rather than to zero in order + * to avoid a too-soon return to zero in case of an immediate + * invocation of the just-enqueued callback (or preemption of + * this task). Exclude CPU-hotplug operations to ensure that no + * offline non-offloaded CPU has callbacks queued. */ init_completion(&rcu_state.barrier_completion); - atomic_set(&rcu_state.barrier_cpu_count, 1); + atomic_set(&rcu_state.barrier_cpu_count, 2); get_online_cpus(); /* @@ -3158,13 +3362,23 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (!cpu_online(cpu) && + if (cpu_is_offline(cpu) && !rcu_segcblist_is_offloaded(&rdp->cblist)) continue; - if (rcu_segcblist_n_cbs(&rdp->cblist)) { + if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); + smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); + } else if (rcu_segcblist_n_cbs(&rdp->cblist) && + cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, + rcu_state.barrier_sequence); + local_irq_disable(); + rcu_barrier_func((void *)cpu); + local_irq_enable(); + } else if (cpu_is_offline(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, + rcu_state.barrier_sequence); } else { rcu_barrier_trace(TPS("OnlineNQ"), cpu, rcu_state.barrier_sequence); @@ -3176,7 +3390,7 @@ void rcu_barrier(void) * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count)) complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -3275,12 +3489,12 @@ int rcutree_prepare_cpu(unsigned int cpu) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gp_seq = rnp->gp_seq; - rdp->gp_seq_needed = rnp->gp_seq; + rdp->gp_seq = READ_ONCE(rnp->gp_seq); + rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); @@ -3378,7 +3592,7 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; oldmask ^= rnp->expmaskinitnext; @@ -3431,7 +3645,7 @@ void rcu_report_dead(unsigned int cpu) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } - rnp->qsmaskinitnext &= ~mask; + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); @@ -3545,7 +3759,10 @@ static int __init rcu_spawn_gp_kthread(void) } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + // Reset .gp_activity and .gp_req_activity before setting .gp_kthread. + smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); @@ -3769,8 +3986,11 @@ static void __init kfree_rcu_batch_init(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); - for (i = 0; i < KFREE_N_BATCHES; i++) + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } @@ -3809,6 +4029,13 @@ void __init rcu_init(void) rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); srcu_init(); + + /* Fill in default value for rcutree.qovld boot parameter. */ + /* -After- the rcu_node ->lock fields are initialized! */ + if (qovld < 0) + qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; + else + qovld_calc = qovld; } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c87e4c161c2..9dc2ec021da5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,8 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + unsigned long cbovldmask; + /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ @@ -321,6 +323,8 @@ struct rcu_state { atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ + u8 cbovld; /* Callback overload now? */ + u8 cbovldnext; /* ^ ^ next time? */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dcbd75791f39..1a617b9dffb0 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -314,7 +314,7 @@ static bool exp_funnel_lock(unsigned long s) sync_exp_work_done(s)); return true; } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ + WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("nxtlvl")); @@ -485,6 +485,7 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) static void synchronize_rcu_expedited_wait(void) { int cpu; + unsigned long j; unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; @@ -496,7 +497,7 @@ static void synchronize_rcu_expedited_wait(void) trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { @@ -508,12 +509,16 @@ static void synchronize_rcu_expedited_wait(void) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } } + j = READ_ONCE(jiffies_till_first_fqs); + if (synchronize_rcu_expedited_wait_once(j + HZ)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)); } for (;;) { if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", @@ -589,7 +594,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; + WRITE_ONCE(rnp->exp_seq_rq, s); spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c6ea81cd4189..097635c41135 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -56,6 +56,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); if (qlowmark != DEFAULT_RCU_QLOMARK) pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (qovld != DEFAULT_RCU_QOVLD) + pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld); if (jiffies_till_first_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) @@ -753,7 +755,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - (long)rnp->gp_seq, (long)rnp->completedqs); + (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); @@ -1032,18 +1034,18 @@ static int rcu_boost_kthread(void *arg) trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); if (more2boost) spincnt++; else spincnt = 0; if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); @@ -1077,12 +1079,12 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { + (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, - rnp->boost_kthread_status); + READ_ONCE(rnp->boost_kthread_status)); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1486,6 +1488,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); * flag the contention. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) @@ -1529,6 +1532,7 @@ static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) * Release the specified rcu_data structure's ->nocb_bypass_lock. */ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_bypass_lock); @@ -1577,8 +1581,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist) && - cpu_online(rdp->cpu)) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) lockdep_assert_held(&rdp->nocb_lock); } @@ -1930,6 +1933,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) struct rcu_data *rdp; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; /* * Each pass through the following loop checks for CBs and for the @@ -1969,10 +1973,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ } // Need to wait on some grace period? - WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)); if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { if (!needwait_gp || diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 55f9b84790d3..119ed6afd20f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -102,7 +102,7 @@ static void record_gp_stall_check_time(void) unsigned long j = jiffies; unsigned long j1; - rcu_state.gp_start = j; + WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); /* Record ->gp_start before ->jiffies_stall. */ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ @@ -383,7 +383,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -452,7 +452,7 @@ static void print_cpu_stall(void) /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) + if (rcu_stall_is_suppressed()) return; /* @@ -504,7 +504,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); @@ -578,6 +578,7 @@ void show_rcu_gp_kthreads(void) unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; ja = j - READ_ONCE(rcu_state.gp_activity); @@ -585,28 +586,28 @@ void show_rcu_gp_kthreads(void) jw = j - READ_ONCE(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + rcu_state.gp_state, t ? t->state : 0x1ffffL, ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), (long)READ_ONCE(rcu_state.gp_seq), (long)READ_ONCE(rcu_get_root()->gp_seq_needed), READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); + rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), + (long)READ_ONCE(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) + if (READ_ONCE(rdp->gpwrap) || + ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); + cpu, (long)READ_ONCE(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -631,7 +632,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, static atomic_t warned = ATOMIC_INIT(0); if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread. return; j = jiffies; /* Expensive access, and in common case don't get here. */ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || @@ -642,7 +645,8 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { @@ -655,9 +659,10 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_xchg(&warned, 1)) { if (rnp_root != rnp) /* irqs remain disabled. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..28a8bdc5072f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -183,6 +183,8 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); +static bool rcu_boot_ended __read_mostly; + /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -191,7 +193,17 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); + rcu_boot_ended = 1; +} + +/* + * Let rcutorture know when it is OK to turn it up to eleven. + */ +bool rcu_inkernel_boot_has_ended(void) +{ + return rcu_boot_ended; } +EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended); #endif /* #ifndef CONFIG_TINY_RCU */ @@ -227,18 +239,30 @@ core_initcall(rcu_set_runtime_mode); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +struct lockdep_map rcu_lock_map = { + .name = "rcu_read_lock", + .key = &rcu_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */ +}; EXPORT_SYMBOL_GPL(rcu_lock_map); static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +struct lockdep_map rcu_bh_lock_map = { + .name = "rcu_read_lock_bh", + .key = &rcu_bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */ +}; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +struct lockdep_map rcu_sched_lock_map = { + .name = "rcu_read_lock_sched", + .key = &rcu_sched_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_SPIN, +}; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); static struct lock_class_key rcu_callback_key; @@ -464,13 +488,19 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall +// warnings. Also used by rcutorture even if stall warnings are excluded. +int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); +module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); + #ifdef CONFIG_TASKS_RCU /* @@ -528,7 +558,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rhp->func = func; raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); needwake = !rcu_tasks_cbs_head; - *rcu_tasks_cbs_tail = rhp; + WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ @@ -658,7 +688,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* If there were none, wait a bit and start over. */ if (!list) { wait_event_interruptible(rcu_tasks_cbs_wq, - rcu_tasks_cbs_head); + READ_ONCE(rcu_tasks_cbs_head)); if (!rcu_tasks_cbs_head) { WARN_ON(signal_pending(current)); schedule_timeout_interruptible(HZ/10); @@ -801,7 +831,7 @@ static int __init rcu_spawn_tasks_kthread(void) core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) { preempt_disable(); current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); @@ -809,7 +839,7 @@ void exit_tasks_rcu_start(void) } /* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) { preempt_disable(); __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..a778554f9dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -29,12 +29,12 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (x->done != UINT_MAX) x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + lockdep_assert_RT_in_threaded_ctx(); + + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done = UINT_MAX; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_all_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_SWAITQUEUE(wait); - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } + __prepare_to_swait(&x->wait, &wait); __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x, complete_acquire(x); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); complete_release(x); @@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x) if (!READ_ONCE(x->done)) return false; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = false; else if (x->done != UINT_MAX) x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +324,8 @@ bool completion_done(struct completion *x) * otherwise we can end up freeing the completion before complete() * is done referencing it. */ - spin_lock_irqsave(&x->wait.lock, flags); - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1e72d1b3d3ce..464742874be3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2539,3 +2539,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } #endif + +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..e1c655f928c7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ + while (!list_empty(&q->task_list)) + swake_up_locked(q); +} + void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..ec5c606bc3a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -528,8 +528,12 @@ static long seccomp_attach_filter(unsigned int flags, int ret; ret = seccomp_can_sync_threads(); - if (ret) - return ret; + if (ret) { + if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) + return -ESRCH; + else + return ret; + } } /* Set log flag, if present. */ @@ -1221,6 +1225,7 @@ static const struct file_operations seccomp_notify_ops = { .poll = seccomp_notify_poll, .release = seccomp_notify_release, .unlocked_ioctl = seccomp_notify_ioctl, + .compat_ioctl = seccomp_notify_ioctl, }; static struct file *init_listener(struct seccomp_filter *filter) @@ -1288,10 +1293,12 @@ static long seccomp_set_mode_filter(unsigned int flags, * In the successful case, NEW_LISTENER returns the new listener fd. * But in the failure case, TSYNC returns the thread that died. If you * combine these two flags, there's no way to tell whether something - * succeeded or failed. So, let's disallow this combination. + * succeeded or failed. So, let's disallow this combination if the user + * has not explicitly requested no errors from TSYNC. */ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && - (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && + ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; /* Prepare the new filter before holding any locks. */ diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..825f28259a19 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -97,16 +97,26 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ - raw_spin_lock_irq(&task->pi_lock); do { + head = NULL; work = READ_ONCE(task->task_works); - head = !work && (task->flags & PF_EXITING) ? - &work_exited : NULL; + if (!work) { + if (task->flags & PF_EXITING) + head = &work_exited; + else + break; + } } while (cmpxchg(&task->task_works, work, head) != work); - raw_spin_unlock_irq(&task->pi_lock); if (!work) break; + /* + * Synchronize with task_work_cancel(). It can not remove + * the first entry == work, cmpxchg(task_works) must fail. + * But it can remove another entry from the ->next list. + */ + raw_spin_lock_irq(&task->pi_lock); + raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..8cce72501aea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; - timer->is_hard = !softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1514,7 +1514,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(timer); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index d23b434c2ca7..eddcf4970444 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = { .max_cycles = 10, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -67,9 +68,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..2c48a7233b19 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1126,8 +1126,11 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - if (!lock_task_sighand(tsk, &flags)) + lockdep_posixtimer_enter(); + if (!lock_task_sighand(tsk, &flags)) { + lockdep_posixtimer_exit(); return; + } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1169,6 +1172,7 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..6c9c342dd0e5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -84,13 +84,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a792d21cac64..3e2dc9b8858c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevaluate with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { @@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -240,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, + .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), }; /* @@ -676,10 +682,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..856280d2cbd4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2397,8 +2397,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 141ab3ab0354..099737f6f10c 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { } extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4820823515e9..568564ae3597 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,6 +944,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) @@ -960,7 +961,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ - if (timer_pending(timer)) { + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new @@ -1133,7 +1134,7 @@ EXPORT_SYMBOL(timer_reduce); void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); @@ -1891,7 +1892,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer); diff --git a/kernel/torture.c b/kernel/torture.c index 7c13f5558b71..8683375dc0c7 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -42,6 +42,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); +static bool disable_onoff_at_boot; +module_param(disable_onoff_at_boot, bool, 0444); + static char *torture_type; static int verbose; @@ -84,6 +87,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -99,10 +103,16 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, (*n_offl_attempts)++; ret = cpu_down(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_offl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: offline %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -137,6 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, { unsigned long delta; int ret; + char *s; unsigned long starttime; if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) @@ -150,10 +161,16 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, (*n_onl_attempts)++; ret = cpu_up(cpu); if (ret) { + s = ""; + if (!rcu_inkernel_boot_has_ended() && ret == -EBUSY) { + // PCI probe frequently disables hotplug during boot. + (*n_onl_attempts)--; + s = " (-EBUSY forgiven during boot)"; + } if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); + "torture_onoff task: online %d failed%s: errno %d\n", + torture_type, cpu, s, ret); } else { if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -215,6 +232,10 @@ torture_onoff(void *arg) VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { + if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { + schedule_timeout_interruptible(HZ / 10); + continue; + } cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, &n_offline_successes, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4560878f0bac..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (bt == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..68250d433bd7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f7ee102868a..fd81c7de77a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); + if (rec) + break; } return rec; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 301db4406bc3..4e01c448b4b4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; rcu_read_lock(); retry: - if (req_cpu == WORK_CPU_UNBOUND) - cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else + if (wq->flags & WQ_UNBOUND) { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + } else { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + } /* * If @work was previously on a different pool, it might still be |