summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c1
-rw-r--r--kernel/bpf/lpm_trie.c1
-rw-r--r--kernel/bpf/stackmap.c1
-rw-r--r--kernel/bpf/syscall.c5
-rw-r--r--kernel/bpf/verifier.c187
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/kprobes.c10
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/locking/rtmutex.c24
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c29
-rw-r--r--kernel/ptrace.c20
-rw-r--r--kernel/sched/core.c25
-rw-r--r--kernel/sched/cpufreq_schedutil.c7
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/time/posix-cpu-timers.c24
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/trace.c34
-rw-r--r--kernel/trace/trace.h5
-rw-r--r--kernel/trace/trace_kprobe.c5
25 files changed, 316 insertions, 118 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5e00b2333c26..172dc8ee0e3b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
+ array->map.map_flags = attr->map_flags;
array->elem_size = elem_size;
if (!percpu)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 39cfafd895b8..b09185f0f17d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
trie->map.key_size = attr->key_size;
trie->map.value_size = attr->value_size;
trie->map.max_entries = attr->max_entries;
+ trie->map.map_flags = attr->map_flags;
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4dfd6f2ec2f9..31147d730abf 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->map.key_size = attr->key_size;
smap->map.value_size = value_size;
smap->map.max_entries = attr->max_entries;
+ smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fd2411fd6914..265a0d854e33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD kern_version
+#define BPF_PROG_LOAD_LAST_FIELD prog_flags
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
+ if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+ return -EINVAL;
+
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c5b56c92f8e2..339c8a1371de 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -140,7 +140,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 65536
+#define BPF_COMPLEXITY_LIMIT_INSNS 98304
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
@@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state)
if (reg->max_value != BPF_REGISTER_MAX_RANGE)
verbose(",max_value=%llu",
(unsigned long long)reg->max_value);
+ if (reg->min_align)
+ verbose(",min_align=%u", reg->min_align);
+ if (reg->aux_off)
+ verbose(",aux_off=%u", reg->aux_off);
+ if (reg->aux_off_align)
+ verbose(",aux_off_align=%u", reg->aux_off_align);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
@@ -457,16 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+
+ memset(&regs[regno], 0, sizeof(regs[regno]));
+ regs[regno].type = NOT_INIT;
+ regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+}
+
static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- regs[i].type = NOT_INIT;
- regs[i].imm = 0;
- regs[i].min_value = BPF_REGISTER_MIN_RANGE;
- regs[i].max_value = BPF_REGISTER_MAX_RANGE;
- }
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_reg_not_init(regs, i);
/* frame pointer */
regs[BPF_REG_FP].type = FRAME_PTR;
@@ -492,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
{
regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+ regs[regno].min_align = 0;
}
static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
@@ -779,17 +792,37 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
- int off, int size)
+ int off, int size, bool strict)
{
- if (reg->id && size != 1) {
- verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
- return -EACCES;
+ int ip_align;
+ int reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = reg->off;
+ if (reg->id) {
+ if (reg->aux_off_align % size) {
+ verbose("Packet access is only %u byte aligned, %d byte access not allowed\n",
+ reg->aux_off_align, size);
+ return -EACCES;
+ }
+ reg_off += reg->aux_off;
}
- /* skb->data is NET_IP_ALIGN-ed */
- if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+ /* For platforms that do not have a Kconfig enabling
+ * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
+ * NET_IP_ALIGN is universally set to '2'. And on platforms
+ * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
+ * to this code only in strict mode where we want to emulate
+ * the NET_IP_ALIGN==2 checking. Therefore use an
+ * unconditional IP align value of '2'.
+ */
+ ip_align = 2;
+ if ((ip_align + reg_off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
- NET_IP_ALIGN, reg->off, off, size);
+ ip_align, reg_off, off, size);
return -EACCES;
}
@@ -797,9 +830,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
}
static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
- int size)
+ int size, bool strict)
{
- if (size != 1) {
+ if (strict && size != 1) {
verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
return -EACCES;
}
@@ -807,16 +840,17 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size)
{
+ bool strict = env->strict_alignment;
+
switch (reg->type) {
case PTR_TO_PACKET:
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
- check_pkt_ptr_alignment(reg, off, size);
+ return check_pkt_ptr_alignment(reg, off, size, strict);
case PTR_TO_MAP_VALUE_ADJ:
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
- check_val_ptr_alignment(reg, size);
+ return check_val_ptr_alignment(reg, size, strict);
default:
if (off % size != 0) {
verbose("misaligned access off %d size %d\n",
@@ -849,7 +883,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
if (size < 0)
return size;
- err = check_ptr_alignment(reg, off, size);
+ err = check_ptr_alignment(env, reg, off, size);
if (err)
return err;
@@ -883,6 +917,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
value_regno);
/* note that reg.[id|off|range] == 0 */
state->regs[value_regno].type = reg_type;
+ state->regs[value_regno].aux_off = 0;
+ state->regs[value_regno].aux_off_align = 0;
}
} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -1313,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs = state->regs;
- struct bpf_reg_state *reg;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
@@ -1380,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
}
/* reset caller saved regs */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
- }
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(regs, caller_saved[i]);
/* update return register */
if (fn->ret_type == RET_INTEGER) {
@@ -1455,6 +1487,8 @@ add_imm:
*/
dst_reg->off += imm;
} else {
+ bool had_id;
+
if (src_reg->type == PTR_TO_PACKET) {
/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
tmp_reg = *dst_reg; /* save r7 state */
@@ -1488,14 +1522,23 @@ add_imm:
src_reg->imm);
return -EACCES;
}
+
+ had_id = (dst_reg->id != 0);
+
/* dst_reg stays as pkt_ptr type and since some positive
* integer value was added to the pointer, increment its 'id'
*/
dst_reg->id = ++env->id_gen;
- /* something was added to pkt_ptr, set range and off to zero */
+ /* something was added to pkt_ptr, set range to zero */
+ dst_reg->aux_off += dst_reg->off;
dst_reg->off = 0;
dst_reg->range = 0;
+ if (had_id)
+ dst_reg->aux_off_align = min(dst_reg->aux_off_align,
+ src_reg->min_align);
+ else
+ dst_reg->aux_off_align = src_reg->min_align;
}
return 0;
}
@@ -1669,6 +1712,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg)
reg->min_value = BPF_REGISTER_MIN_RANGE;
}
+static u32 calc_align(u32 imm)
+{
+ if (!imm)
+ return 1U << 31;
+ return imm - ((imm - 1) & imm);
+}
+
static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1676,8 +1726,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
s64 min_val = BPF_REGISTER_MIN_RANGE;
u64 max_val = BPF_REGISTER_MAX_RANGE;
u8 opcode = BPF_OP(insn->code);
+ u32 dst_align, src_align;
dst_reg = &regs[insn->dst_reg];
+ src_align = 0;
if (BPF_SRC(insn->code) == BPF_X) {
check_reg_overflow(&regs[insn->src_reg]);
min_val = regs[insn->src_reg].min_value;
@@ -1693,12 +1745,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
regs[insn->src_reg].type != UNKNOWN_VALUE) {
min_val = BPF_REGISTER_MIN_RANGE;
max_val = BPF_REGISTER_MAX_RANGE;
+ src_align = 0;
+ } else {
+ src_align = regs[insn->src_reg].min_align;
}
} else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
(s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
min_val = max_val = insn->imm;
+ src_align = calc_align(insn->imm);
}
+ dst_align = dst_reg->min_align;
+
/* We don't know anything about what was done to this register, mark it
* as unknown.
*/
@@ -1723,18 +1781,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->min_value += min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value += max_val;
+ dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_SUB:
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
dst_reg->min_value -= min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value -= max_val;
+ dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_MUL:
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
dst_reg->min_value *= min_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value *= max_val;
+ dst_reg->min_align = max(src_align, dst_align);
break;
case BPF_AND:
/* Disallow AND'ing of negative numbers, ain't nobody got time
@@ -1746,17 +1807,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
else
dst_reg->min_value = 0;
dst_reg->max_value = max_val;
+ dst_reg->min_align = max(src_align, dst_align);
break;
case BPF_LSH:
/* Gotta have special overflow logic here, if we're shifting
* more than MAX_RANGE then just assume we have an invalid
* range.
*/
- if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) {
dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value <<= min_val;
-
+ dst_reg->min_align = 1;
+ } else {
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value <<= min_val;
+ if (!dst_reg->min_align)
+ dst_reg->min_align = 1;
+ dst_reg->min_align <<= min_val;
+ }
if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
@@ -1766,11 +1833,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* RSH by a negative number is undefined, and the BPF_RSH is an
* unsigned shift, so make the appropriate casts.
*/
- if (min_val < 0 || dst_reg->min_value < 0)
+ if (min_val < 0 || dst_reg->min_value < 0) {
dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- else
+ } else {
dst_reg->min_value =
(u64)(dst_reg->min_value) >> min_val;
+ }
+ if (min_val < 0) {
+ dst_reg->min_align = 1;
+ } else {
+ dst_reg->min_align >>= (u64) min_val;
+ if (!dst_reg->min_align)
+ dst_reg->min_align = 1;
+ }
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
dst_reg->max_value >>= max_val;
break;
@@ -1872,6 +1947,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].imm = insn->imm;
regs[insn->dst_reg].max_value = insn->imm;
regs[insn->dst_reg].min_value = insn->imm;
+ regs[insn->dst_reg].min_align = calc_align(insn->imm);
}
} else if (opcode > BPF_END) {
@@ -2368,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs;
u8 mode = BPF_MODE(insn->code);
- struct bpf_reg_state *reg;
int i, err;
if (!may_access_skb(env->prog->type)) {
@@ -2401,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* reset caller saved regs to unreadable */
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
- }
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(regs, caller_saved[i]);
/* mark destination R0 register as readable, since it contains
* the value fetched from the packet
@@ -2564,6 +2636,7 @@ peek_stack:
env->explored_states[t + 1] = STATE_LIST_MARK;
} else {
/* conditional jump with two edges */
+ env->explored_states[t] = STATE_LIST_MARK;
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret == 1)
goto peek_stack;
@@ -2615,7 +2688,8 @@ err_free:
/* the following conditions reduce the number of explored insns
* from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
*/
-static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
+ struct bpf_reg_state *old,
struct bpf_reg_state *cur)
{
if (old->id != cur->id)
@@ -2658,7 +2732,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
* 'if (R4 > data_end)' and all further insn were already good with r=20,
* so they will be good with r=30 and we can prune the search.
*/
- if (old->off <= cur->off &&
+ if (!env->strict_alignment && old->off <= cur->off &&
old->off >= old->range && cur->off >= cur->range)
return true;
@@ -2722,8 +2796,14 @@ static bool states_equal(struct bpf_verifier_env *env,
rcur->type != NOT_INIT))
continue;
+ /* Don't care about the reg->id in this case. */
+ if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rold->map_ptr == rcur->map_ptr)
+ continue;
+
if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
- compare_ptrs_to_packet(rold, rcur))
+ compare_ptrs_to_packet(env, rold, rcur))
continue;
return false;
@@ -2856,8 +2936,15 @@ static int do_check(struct bpf_verifier_env *env)
goto process_bpf_exit;
}
- if (log_level && do_print_state) {
- verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ if (need_resched())
+ cond_resched();
+
+ if (log_level > 1 || (log_level && do_print_state)) {
+ if (log_level > 1)
+ verbose("%d:", insn_idx);
+ else
+ verbose("\nfrom %d to %d:",
+ prev_insn_idx, insn_idx);
print_verifier_state(&env->cur_state);
do_print_state = false;
}
@@ -3495,6 +3582,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
log_level = 0;
}
+ env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ env->strict_alignment = true;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -3600,6 +3691,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
log_level = 0;
+ env->strict_alignment = false;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ env->strict_alignment = true;
+
env->explored_states = kcalloc(env->prog->len,
sizeof(struct bpf_verifier_state_list *),
GFP_KERNEL);
diff --git a/kernel/fork.c b/kernel/fork.c
index 06d759ab4c62..e53770d2bf95 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1577,6 +1577,18 @@ static __latent_entropy struct task_struct *copy_process(
if (!p)
goto fork_out;
+ /*
+ * This _must_ happen before we call free_task(), i.e. before we jump
+ * to any of the bad_fork_* labels. This is to avoid freeing
+ * p->set_child_tid which is (ab)used as a kthread's data pointer for
+ * kernel threads (PF_KTHREAD).
+ */
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
@@ -1743,11 +1755,6 @@ static __latent_entropy struct task_struct *copy_process(
}
}
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
- /*
- * Clear TID on mm_release()?
- */
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1845,11 +1852,13 @@ static __latent_entropy struct task_struct *copy_process(
*/
recalc_sigpending();
if (signal_pending(current)) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
+ if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ retval = -ENOMEM;
+ goto bad_fork_cancel_cgroup;
+ }
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1907,6 +1916,8 @@ static __latent_entropy struct task_struct *copy_process(
return p;
bad_fork_cancel_cgroup:
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
bad_fork_free_pid:
cgroup_threadgroup_change_end(current);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 686be4b73018..c94da688ee9b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -880,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
if (!desc)
return;
- __irq_do_set_handler(desc, handle, 1, NULL);
desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
irq_put_desc_busunlock(desc, flags);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7367e0ec6f81..adfe3b4cfe05 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -122,7 +122,7 @@ static void *alloc_insn_page(void)
return module_alloc(PAGE_SIZE);
}
-static void free_insn_page(void *page)
+void __weak free_insn_page(void *page)
{
module_memfree(page);
}
@@ -595,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static void wait_for_kprobe_optimizer(void)
+void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
@@ -2183,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb,
* The vaddr this probe is installed will soon
* be vfreed buy not synced to disk. Hence,
* disarming the breakpoint isn't needed.
+ *
+ * Note, this will also move any optimized probes
+ * that are pending to be removed from their
+ * corresponding lists to the freeing_list and
+ * will not be touched by the delayed
+ * kprobe_optimizer work handler.
*/
kill_kprobe(p);
}
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 045022557936..ec4565122e65 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -10,6 +10,7 @@ config LIVEPATCH
depends on SYSFS
depends on KALLSYMS_ALL
depends on HAVE_LIVEPATCH
+ depends on !TRIM_UNUSED_KSYMS
help
Say Y here if you want to support kernel live patching.
This option has no runtime impact until a kernel "patch"
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b95509416909..28cd09e635ed 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1785,12 +1785,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
int ret;
raw_spin_lock_irq(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
-
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1822,15 +1824,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
/*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
* Unless we're the owner; we're still enqueued on the wait_list.
* So check if we became owner, if not, take us off the wait_list.
*/
if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
- fixup_rt_mutex_waiters(lock);
cleanup = true;
}
-
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d1f3e9f558b8..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* if reparented.
*/
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->nr_hashed == init_pids)
break;
schedule();
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..c7209f060eeb 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
- pm_wakeup_clear(true);
+ pm_wakeup_clear();
pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3b1e0f3ad07f..fa46606f3356 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1425,7 +1425,7 @@ static unsigned int nr_meta_pages;
* Numbers of normal and highmem page frames allocated for hibernation image
* before suspending devices.
*/
-unsigned int alloc_normal, alloc_highmem;
+static unsigned int alloc_normal, alloc_highmem;
/*
* Memory bitmap used for marking saveable pages (during hibernation) or
* hibernation image pages (during restore)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c0248c74d6d4..15e6baef5c73 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,8 +72,6 @@ static void freeze_begin(void)
static void freeze_enter(void)
{
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
-
spin_lock_irq(&suspend_freeze_lock);
if (pm_wakeup_pending())
goto out;
@@ -100,27 +98,6 @@ static void freeze_enter(void)
out:
suspend_freeze_state = FREEZE_STATE_NONE;
spin_unlock_irq(&suspend_freeze_lock);
-
- trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
-}
-
-static void s2idle_loop(void)
-{
- do {
- freeze_enter();
-
- if (freeze_ops && freeze_ops->wake)
- freeze_ops->wake();
-
- dpm_resume_noirq(PMSG_RESUME);
- if (freeze_ops && freeze_ops->sync)
- freeze_ops->sync();
-
- if (pm_wakeup_pending())
- break;
-
- pm_wakeup_clear(false);
- } while (!dpm_suspend_noirq(PMSG_SUSPEND));
}
void freeze_wake(void)
@@ -394,8 +371,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
- goto Platform_early_resume;
+ trace_suspend_resume(TPS("machine_suspend"), state, true);
+ freeze_enter();
+ trace_suspend_resume(TPS("machine_suspend"), state, false);
+ goto Platform_wake;
}
error = disable_nonboot_cpus();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 266ddcc1d8bb..60f356d91060 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
}
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -459,7 +465,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759f4bd52cd6..803c3bc274c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disable() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+ /*
+ * As this skips calling sched_submit_work(), which the idle task does
+ * regardless because that function is a nop when the task is in a
+ * TASK_RUNNING state, make sure this isn't used someplace that the
+ * current task can be in any other state. Note, idle is always in the
+ * TASK_RUNNING state.
+ */
+ WARN_ON_ONCE(current->state);
+ do {
+ __schedule(false);
+ } while (need_resched());
+}
+
#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage __visible void __sched schedule_user(void)
{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 76877a62b5fa..622eed1b7658 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -245,11 +245,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- u64 last_freq_update_time = sg_policy->last_freq_update_time;
unsigned long util = 0, max = 1;
unsigned int j;
@@ -265,7 +264,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
* enough, don't take the CPU into account as it probably is
* idle now (and clear iowait_boost for it).
*/
- delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
continue;
@@ -309,7 +308,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
if (flags & SCHED_CPUFREQ_RT_DL)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
- next_f = sugov_next_freq_shared(sg_cpu);
+ next_f = sugov_next_freq_shared(sg_cpu, time);
sugov_update_commit(sg_policy, time, next_f);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2a25a9ec2c6e..ef63adce0c9c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -265,7 +265,7 @@ static void do_idle(void)
smp_mb__after_atomic();
sched_ttwu_pending();
- schedule_preempt_disabled();
+ schedule_idle();
if (unlikely(klp_patch_pending(current)))
klp_update_patch_state(current);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7808ab050599..6dda2aab731e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1467,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
}
#endif
+extern void schedule_idle(void);
+
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1370f067fb51..d2a1e6dd0291 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
- pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
+ if (print_fatal_signals) {
+ pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk,
soft += USEC_PER_SEC;
sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
}
- pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
+ if (print_fatal_signals) {
+ pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
@@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
- pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
+ if (print_fatal_signals) {
+ pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk,
/*
* At the soft limit, send a SIGXCPU every second.
*/
- pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
- tsk->comm, task_pid_nr(tsk));
+ if (print_fatal_signals) {
+ pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ }
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
if (soft < hard) {
soft++;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bd8ae8d5ae9c..193c5f5e3f79 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1662,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
goto out;
if (attr == &dev_attr_act_mask) {
- if (sscanf(buf, "%llx", &value) != 1) {
+ if (kstrtoull(buf, 0, &value)) {
/* Assume it is a list of trace category names */
ret = blk_trace_str2mask(buf);
if (ret < 0)
goto out;
value = ret;
}
- } else if (sscanf(buf, "%llu", &value) != 1)
+ } else if (kstrtoull(buf, 0, &value))
goto out;
ret = -ENXIO;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 39dca4e86a94..9e5841dc14b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4144,9 +4144,9 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
int i, ret = -ENODEV;
int size;
- if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
+ if (!glob || !strlen(glob) || !strcmp(glob, "*"))
func_g.search = NULL;
- else if (glob) {
+ else {
int not;
func_g.type = filter_parse_regex(glob, strlen(glob),
@@ -4256,6 +4256,14 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
return ret;
}
+void clear_ftrace_function_probes(struct trace_array *tr)
+{
+ struct ftrace_func_probe *probe, *n;
+
+ list_for_each_entry_safe(probe, n, &tr->func_probes, list)
+ unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops);
+}
+
static LIST_HEAD(ftrace_commands);
static DEFINE_MUTEX(ftrace_cmd_mutex);
@@ -5055,7 +5063,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
}
out:
- kfree(fgd->new_hash);
+ free_ftrace_hash(fgd->new_hash);
kfree(fgd);
return ret;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c4536c449021..1122f151466f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1558,7 +1558,7 @@ static __init int init_trace_selftests(void)
return 0;
}
-early_initcall(init_trace_selftests);
+core_initcall(init_trace_selftests);
#else
static inline int run_tracer_selftest(struct tracer *type)
{
@@ -2568,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ if (rcu_is_watching()) {
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ return;
+ }
+
+ /*
+ * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+ * but if the above rcu_is_watching() failed, then the NMI
+ * triggered someplace critical, and rcu_irq_enter() should
+ * not be called from NMI.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ /*
+ * It is possible that a function is being traced in a
+ * location that RCU is not watching. A call to
+ * rcu_irq_enter() will make sure that it is, but there's
+ * a few internal rcu functions that could be traced
+ * where that wont work either. In those cases, we just
+ * do nothing.
+ */
+ if (unlikely(rcu_irq_enter_disabled()))
+ return;
+
+ rcu_irq_enter_irqson();
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+ rcu_irq_exit_irqson();
}
/**
@@ -7550,6 +7579,7 @@ static int instance_rmdir(const char *name)
}
tracing_set_nop(tr);
+ clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 291a1bca5748..39fd77330aab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -980,6 +980,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
extern int
unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *ops);
+extern void clear_ftrace_function_probes(struct trace_array *tr);
int register_ftrace_command(struct ftrace_func_command *cmd);
int unregister_ftrace_command(struct ftrace_func_command *cmd);
@@ -998,6 +999,10 @@ static inline __init int unregister_ftrace_command(char *cmd_name)
{
return -EINVAL;
}
+static inline void clear_ftrace_function_probes(struct trace_array *tr)
+{
+}
+
/*
* The ops parameter passed in is usually undefined.
* This must be a macro.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8485f6738a87..c129fca6ec99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1535,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void)
end:
release_all_trace_kprobes();
+ /*
+ * Wait for the optimizer work to finish. Otherwise it might fiddle
+ * with probes in already freed __init text.
+ */
+ wait_for_kprobe_optimizer();
if (warn)
pr_cont("NG: Some tests are failed. Please check them.\n");
else