From 4a00e9df293d010acbea118b9521e08cb85016c6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 25 Jun 2015 15:00:51 -0700 Subject: prctl: more prctl(PR_SET_MM_*) checks Individual prctl(PR_SET_MM_*) calls do some checking to maintain a consistent view of mm->arg_start et al fields, but not enough. In particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/ R_SET_MM_ENV_START/ PR_SET_MM_ENV_END only check that the address lies in an existing VMA, but don't check that the start address is lower than the end address _at all_. Consolidate all consistency checks, so there will be no difference in the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls. The program below makes both ARGV and ENVP areas be reversed. It makes /proc/$PID/cmdline show garbage (it doesn't oops by luck). #include #include #include enum {PAGE_SIZE=4096}; int main(void) { void *p; p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); #define PR_SET_MM 35 #define PR_SET_MM_ARG_START 8 #define PR_SET_MM_ARG_END 9 #define PR_SET_MM_ENV_START 10 #define PR_SET_MM_ENV_END 11 prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0); prctl(PR_SET_MM, PR_SET_MM_ARG_END, (unsigned long)p, 0, 0); prctl(PR_SET_MM, PR_SET_MM_ENV_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0); prctl(PR_SET_MM, PR_SET_MM_ENV_END, (unsigned long)p, 0, 0); pause(); return 0; } [akpm@linux-foundation.org: tidy code, tweak comment] Signed-off-by: Alexey Dobriyan Acked-by: Cyrill Gorcunov Cc: Jarod Wilson Cc: Jan Stancek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 158 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 91 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8571296b7ddb..259fda25eb6b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1722,7 +1722,6 @@ exit_err: goto exit; } -#ifdef CONFIG_CHECKPOINT_RESTORE /* * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. @@ -1818,6 +1817,7 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; @@ -1902,10 +1902,41 @@ out: } #endif /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + /* + * This doesn't move the auxiliary vector itself since it's pinned to + * mm_struct, but it permits filling the vector with new values. It's + * up to the caller to provide sane values here, otherwise userspace + * tools which use this vector might be unhappy. + */ + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (len > sizeof(user_auxv)) + return -EINVAL; + + if (copy_from_user(user_auxv, (const void __user *)addr, len)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, len); + task_unlock(current); + + return 0; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; + struct prctl_mm_map prctl_map; struct vm_area_struct *vma; int error; @@ -1925,6 +1956,9 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_AUXV) + return prctl_set_auxv(mm, addr, arg4); + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; @@ -1933,42 +1967,64 @@ static int prctl_set_mm(int opt, unsigned long addr, down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + prctl_map.start_code = mm->start_code; + prctl_map.end_code = mm->end_code; + prctl_map.start_data = mm->start_data; + prctl_map.end_data = mm->end_data; + prctl_map.start_brk = mm->start_brk; + prctl_map.brk = mm->brk; + prctl_map.start_stack = mm->start_stack; + prctl_map.arg_start = mm->arg_start; + prctl_map.arg_end = mm->arg_end; + prctl_map.env_start = mm->env_start; + prctl_map.env_end = mm->env_end; + prctl_map.auxv = NULL; + prctl_map.auxv_size = 0; + prctl_map.exe_fd = -1; + switch (opt) { case PR_SET_MM_START_CODE: - mm->start_code = addr; + prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: - mm->end_code = addr; + prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: - mm->start_data = addr; + prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: - mm->end_data = addr; + prctl_map.end_data = addr; + break; + case PR_SET_MM_START_STACK: + prctl_map.start_stack = addr; break; - case PR_SET_MM_START_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, - mm->end_data, mm->start_data)) - goto out; - - mm->start_brk = addr; + prctl_map.start_brk = addr; break; - case PR_SET_MM_BRK: - if (addr <= mm->end_data) - goto out; - - if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, - mm->end_data, mm->start_data)) - goto out; - - mm->brk = addr; + prctl_map.brk = addr; break; + case PR_SET_MM_ARG_START: + prctl_map.arg_start = addr; + break; + case PR_SET_MM_ARG_END: + prctl_map.arg_end = addr; + break; + case PR_SET_MM_ENV_START: + prctl_map.env_start = addr; + break; + case PR_SET_MM_ENV_END: + prctl_map.env_end = addr; + break; + default: + goto out; + } + + error = validate_prctl_map(&prctl_map); + if (error) + goto out; + switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can @@ -1985,52 +2041,20 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } - if (opt == PR_SET_MM_START_STACK) - mm->start_stack = addr; - else if (opt == PR_SET_MM_ARG_START) - mm->arg_start = addr; - else if (opt == PR_SET_MM_ARG_END) - mm->arg_end = addr; - else if (opt == PR_SET_MM_ENV_START) - mm->env_start = addr; - else if (opt == PR_SET_MM_ENV_END) - mm->env_end = addr; - break; - - /* - * This doesn't move auxiliary vector itself - * since it's pinned to mm_struct, but allow - * to fill vector with new values. It's up - * to a caller to provide sane values here - * otherwise user space tools which use this - * vector might be unhappy. - */ - case PR_SET_MM_AUXV: { - unsigned long user_auxv[AT_VECTOR_SIZE]; - - if (arg4 > sizeof(user_auxv)) - goto out; - up_read(&mm->mmap_sem); - - if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) - return -EFAULT; - - /* Make sure the last entry is always AT_NULL */ - user_auxv[AT_VECTOR_SIZE - 2] = 0; - user_auxv[AT_VECTOR_SIZE - 1] = 0; - - BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); - - task_lock(current); - memcpy(mm->saved_auxv, user_auxv, arg4); - task_unlock(current); - - return 0; - } - default: - goto out; } + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + error = 0; out: up_read(&mm->mmap_sem); -- cgit v1.2.3 From 3033f14ab78c326871a4902591c2518410add24a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 25 Jun 2015 15:01:19 -0700 Subject: clone: support passing tls argument via C rather than pt_regs magic clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. This patch (of 2): clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thiago Macieira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0bb88b555550..4c95cb34243c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1238,7 +1238,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace) + int trace, + unsigned long tls) { int retval; struct task_struct *p; @@ -1447,7 +1448,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p); + retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); if (retval) goto bad_fork_cleanup_io; @@ -1659,7 +1660,7 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1674,11 +1675,12 @@ struct task_struct *fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long do_fork(unsigned long clone_flags, +long _do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr) + int __user *child_tidptr, + unsigned long tls) { struct task_struct *p; int trace = 0; @@ -1703,7 +1705,7 @@ long do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace); + child_tidptr, NULL, trace, tls); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1744,20 +1746,34 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifndef CONFIG_HAVE_COPY_THREAD_TLS +/* For compatibility with architectures that call do_fork directly rather than + * using the syscall entry points below. */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + return _do_fork(clone_flags, stack_start, stack_size, + parent_tidptr, child_tidptr, 0); +} +#endif + /* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL); + return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, + (unsigned long)arg, NULL, NULL, 0); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, 0, NULL, NULL); + return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); #else /* can not support in nommu mode */ return -EINVAL; @@ -1768,8 +1784,8 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL); + return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + 0, NULL, NULL, 0); } #endif @@ -1777,27 +1793,27 @@ SYSCALL_DEFINE0(vfork) #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, - int, tls_val, + unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, - int, tls_val) + unsigned long, tls) #endif { - return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); } #endif -- cgit v1.2.3 From d43ff430f434d862db59582c0f1f02382a678036 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:24 -0700 Subject: printk: guard the amount written per line by devkmsg_read() This patchset updates netconsole so that it can emit messages with the same header as used in /dev/kmsg which gives neconsole receiver full log information which enables things like structured logging and detection of lost messages. This patch (of 7): devkmsg_read() uses 8k buffer and assumes that the formatted output message won't overrun which seems safe given LOG_LINE_MAX, the current use of dict and the escaping method being used; however, we're planning to use devkmsg formatting wider and accounting for the buffer size properly isn't that complicated. This patch defines CONSOLE_EXT_LOG_MAX as 8192 and updates devkmsg_read() so that it limits output accordingly. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c099b082cd02..a11549034e07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -505,6 +505,11 @@ int check_syslog_permissions(int type, bool from_file) return security_syslog(type); } +static void append_char(char **pp, char *e, char c) +{ + if (*pp < e) + *(*pp)++ = c; +} /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { @@ -512,7 +517,7 @@ struct devkmsg_user { u32 idx; enum log_flags prev; struct mutex lock; - char buf[8192]; + char buf[CONSOLE_EXT_LOG_MAX]; }; static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) @@ -570,6 +575,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, { struct devkmsg_user *user = file->private_data; struct printk_log *msg; + char *p, *e; u64 ts_usec; size_t i; char cont = '-'; @@ -579,6 +585,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (!user) return -EBADF; + p = user->buf; + e = user->buf + sizeof(user->buf); + ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; @@ -625,9 +634,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) cont = '+'; - len = sprintf(user->buf, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, - user->seq, ts_usec, cont); + p += scnprintf(p, e - p, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); user->prev = msg->flags; /* escape non-printable characters */ @@ -635,11 +644,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, unsigned char c = log_text(msg)[i]; if (c < ' ' || c >= 127 || c == '\\') - len += sprintf(user->buf + len, "\\x%02x", c); + p += scnprintf(p, e - p, "\\x%02x", c); else - user->buf[len++] = c; + append_char(&p, e, c); } - user->buf[len++] = '\n'; + append_char(&p, e, '\n'); if (msg->dict_len) { bool line = true; @@ -648,30 +657,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, unsigned char c = log_dict(msg)[i]; if (line) { - user->buf[len++] = ' '; + append_char(&p, e, ' '); line = false; } if (c == '\0') { - user->buf[len++] = '\n'; + append_char(&p, e, '\n'); line = true; continue; } if (c < ' ' || c >= 127 || c == '\\') { - len += sprintf(user->buf + len, "\\x%02x", c); + p += scnprintf(p, e - p, "\\x%02x", c); continue; } - user->buf[len++] = c; + append_char(&p, e, c); } - user->buf[len++] = '\n'; + append_char(&p, e, '\n'); } user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); + len = p - user->buf; if (len > count) { ret = -EINVAL; goto out; -- cgit v1.2.3 From 0a295e67ec19d59bdb146e0b60ac9659104f2215 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:27 -0700 Subject: printk: factor out message formatting from devkmsg_read() The extended message formatting used for /dev/kmsg will be used implement extended consoles. Factor out msg_print_ext_header() and msg_print_ext_body() from devkmsg_read(). This is pure restructuring. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 146 +++++++++++++++++++++++++++---------------------- 1 file changed, 80 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a11549034e07..51ce4f1838b3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -511,6 +511,81 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) +{ + u64 ts_usec = msg->ts_nsec; + char cont = '-'; + + do_div(ts_usec, 1000); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + return scnprintf(buf, size, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, seq, ts_usec, cont); +} + +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) +{ + char *p = buf, *e = buf + size; + size_t i; + + /* escape non-printable characters */ + for (i = 0; i < text_len; i++) { + unsigned char c = text[i]; + + if (c < ' ' || c >= 127 || c == '\\') + p += scnprintf(p, e - p, "\\x%02x", c); + else + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + + if (dict_len) { + bool line = true; + + for (i = 0; i < dict_len; i++) { + unsigned char c = dict[i]; + + if (line) { + append_char(&p, e, ' '); + line = false; + } + + if (c == '\0') { + append_char(&p, e, '\n'); + line = true; + continue; + } + + if (c < ' ' || c >= 127 || c == '\\') { + p += scnprintf(p, e - p, "\\x%02x", c); + continue; + } + + append_char(&p, e, c); + } + append_char(&p, e, '\n'); + } + + return p - buf; +} + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -575,19 +650,12 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, { struct devkmsg_user *user = file->private_data; struct printk_log *msg; - char *p, *e; - u64 ts_usec; - size_t i; - char cont = '-'; size_t len; ssize_t ret; if (!user) return -EBADF; - p = user->buf; - e = user->buf + sizeof(user->buf); - ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; @@ -617,71 +685,17 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, } msg = log_from_idx(user->idx); - ts_usec = msg->ts_nsec; - do_div(ts_usec, 1000); + len = msg_print_ext_header(user->buf, sizeof(user->buf), + msg, user->seq, user->prev); + len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; - - p += scnprintf(p, e - p, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, - user->seq, ts_usec, cont); user->prev = msg->flags; - - /* escape non-printable characters */ - for (i = 0; i < msg->text_len; i++) { - unsigned char c = log_text(msg)[i]; - - if (c < ' ' || c >= 127 || c == '\\') - p += scnprintf(p, e - p, "\\x%02x", c); - else - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - - if (msg->dict_len) { - bool line = true; - - for (i = 0; i < msg->dict_len; i++) { - unsigned char c = log_dict(msg)[i]; - - if (line) { - append_char(&p, e, ' '); - line = false; - } - - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } - - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } - - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } - user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); - len = p - user->buf; if (len > count) { ret = -EINVAL; goto out; -- cgit v1.2.3 From 6fe29354befe4c46eb308b662155d4d8017358e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Jun 2015 15:01:30 -0700 Subject: printk: implement support for extended console drivers printk log_buf keeps various metadata for each message including its sequence number and timestamp. The metadata is currently available only through /dev/kmsg and stripped out before passed onto console drivers. We want this metadata to be available to console drivers too so that console consumers can get full information including the metadata and dictionary, which among other things can be used to detect whether messages got lost in transit. This patch implements support for extended console drivers. Consoles can indicate that they want extended messages by setting the new CON_EXTENDED flag and they'll be fed messages formatted the same way as /dev/kmsg. ",,,;\n" If extended consoles exist, in-kernel fragment assembly is disabled. This ensures that all messages emitted to consoles have full metadata including sequence number. The contflag carries enough information to reassemble the fragments from the reader side trivially. Note that this only affects /dev/kmsg. Regular console and /proc/kmsg outputs are not affected by this change. * Extended message formatting for console drivers is enabled iff there are registered extended consoles. * Comment describing /dev/kmsg message format updated to add missing contflag field and help distinguishing variable from verbatim terms. Signed-off-by: Tejun Heo Cc: David Miller Cc: Kay Sievers Reviewed-by: Petr Mladek Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 66 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 51ce4f1838b3..ae980dc3ac1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -84,6 +84,18 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +/* + * Number of registered extended console drivers. + * + * If extended consoles are present, in-kernel cont reassembly is disabled + * and each fragment is stored as a separate log entry with proper + * continuation flag so that every emitted message has full metadata. This + * doesn't change the result for regular consoles or /proc/kmsg. For + * /dev/kmsg, as long as the reader concatenates messages according to + * consecutive continuation flags, the end result should be the same too. + */ +static int nr_ext_console_drivers; + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -195,7 +207,7 @@ static int console_may_schedule; * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: - * "level,sequnum,timestamp;\n" + * ",,,;\n" * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible @@ -1417,7 +1429,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, const char *text, size_t len) +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) { struct console *con; @@ -1438,7 +1452,10 @@ static void call_console_drivers(int level, const char *text, size_t len) if (!cpu_online(smp_processor_id()) && !(con->flags & CON_ANYTIME)) continue; - con->write(con, text, len); + if (con->flags & CON_EXTENDED) + con->write(con, ext_text, ext_len); + else + con->write(con, text, len); } } @@ -1581,8 +1598,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len) if (cont.len && cont.flushed) return false; - if (cont.len + len > sizeof(cont.buf)) { - /* the line gets too long, split it up in separate records */ + /* + * If ext consoles are present, flush and skip in-kernel + * continuation. See nr_ext_console_drivers definition. Also, if + * the line gets too long, split it up in separate records. + */ + if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { cont_flush(LOG_CONT); return false; } @@ -1917,9 +1938,19 @@ static struct cont { u8 level; bool flushed:1; } cont; +static char *log_text(const struct printk_log *msg) { return NULL; } +static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } -static void call_console_drivers(int level, const char *text, size_t len) {} +static ssize_t msg_print_ext_header(char *buf, size_t size, + struct printk_log *msg, u64 seq, + enum log_flags prev_flags) { return 0; } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } +static void call_console_drivers(int level, + const char *ext_text, size_t ext_len, + const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -2172,7 +2203,7 @@ static void console_cont_flush(char *text, size_t size) len = cont_print_text(text, size); raw_spin_unlock(&logbuf_lock); stop_critical_timings(); - call_console_drivers(cont.level, text, len); + call_console_drivers(cont.level, NULL, 0, text, len); start_critical_timings(); local_irq_restore(flags); return; @@ -2196,6 +2227,7 @@ out: */ void console_unlock(void) { + static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; @@ -2214,6 +2246,7 @@ void console_unlock(void) again: for (;;) { struct printk_log *msg; + size_t ext_len = 0; size_t len; int level; @@ -2259,13 +2292,22 @@ skip: level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); + if (nr_ext_console_drivers) { + ext_len = msg_print_ext_header(ext_text, + sizeof(ext_text), + msg, console_seq, console_prev); + ext_len += msg_print_ext_body(ext_text + ext_len, + sizeof(ext_text) - ext_len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); + } console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, text, len); + call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); } @@ -2521,6 +2563,11 @@ void register_console(struct console *newcon) newcon->next = console_drivers->next; console_drivers->next = newcon; } + + if (newcon->flags & CON_EXTENDED) + if (!nr_ext_console_drivers++) + pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + if (newcon->flags & CON_PRINTBUFFER) { /* * console_unlock(); will print out the buffered messages @@ -2593,6 +2640,9 @@ int unregister_console(struct console *console) } } + if (!res && (console->flags & CON_EXTENDED)) + nr_ext_console_drivers--; + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. -- cgit v1.2.3 From d194e5d666225b04c7754471df0948f645b6ab3a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 25 Jun 2015 15:01:44 -0700 Subject: security_syslog() should be called once only The final version of commit 637241a900cb ("kmsg: honor dmesg_restrict sysctl on /dev/kmsg") lost few hooks, as result security_syslog() are processed incorrectly: - open of /dev/kmsg checks syslog access permissions by using check_syslog_permissions() where security_syslog() is not called if dmesg_restrict is set. - syslog syscall and /proc/kmsg calls do_syslog() where security_syslog can be executed twice (inside check_syslog_permissions() and then directly in do_syslog()) With this patch security_syslog() is called once only in all syslog-related operations regardless of dmesg_restrict value. Fixes: 637241a900cb ("kmsg: honor dmesg_restrict sysctl on /dev/kmsg") Signed-off-by: Vasily Averin Cc: Kees Cook Cc: Josh Boyer Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ae980dc3ac1e..45fa8c88ac47 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -496,11 +496,11 @@ int check_syslog_permissions(int type, bool from_file) * already done the capabilities checks at open time. */ if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; + goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) - return 0; + goto ok; /* * For historical reasons, accept CAP_SYS_ADMIN too, with * a warning. @@ -510,10 +510,11 @@ int check_syslog_permissions(int type, bool from_file) "CAP_SYS_ADMIN but no CAP_SYSLOG " "(deprecated).\n", current->comm, task_pid_nr(current)); - return 0; + goto ok; } return -EPERM; } +ok: return security_syslog(type); } @@ -1299,10 +1300,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; - error = security_syslog(type); - if (error) - return error; - switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; -- cgit v1.2.3 From 3ea4331c60be3eee4c97e5ddabad95399f879b76 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 25 Jun 2015 15:01:47 -0700 Subject: check_syslog_permissions() cleanup Patch fixes drawbacks in heck_syslog_permissions() noticed by AKPM: "from_file handling makes me cry. That's not a boolean - it's an enumerated value with two values currently defined. But the code in check_syslog_permissions() treats it as a boolean and also hardwires the knowledge that SYSLOG_FROM_PROC == 1 (or == `true`). And the name is wrong: it should be called from_proc to match SYSLOG_FROM_PROC." Signed-off-by: Vasily Averin Cc: Kees Cook Cc: Josh Boyer Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 45fa8c88ac47..de553849f3ac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -489,13 +489,13 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ - if (from_file && type != SYSLOG_ACTION_OPEN) + if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { @@ -1290,13 +1290,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return len; } -int do_syslog(int type, char __user *buf, int len, bool from_file) +int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; - error = check_syslog_permissions(type, from_file); + error = check_syslog_permissions(type, source); if (error) goto out; @@ -1379,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) syslog_prev = 0; syslog_partial = 0; } - if (from_file) { + if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of -- cgit v1.2.3 From 1bb564718f298dfd7c435819d3bc902e6be666c6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 25 Jun 2015 15:02:25 -0700 Subject: kernel/trace/trace_events_filter.c: use strreplace() There's no point in starting over every time we see a ','... Signed-off-by: Rasmus Villemoes Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_events_filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7f2e97ce71a7..9d4a78f45dc4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2082,7 +2082,7 @@ struct function_filter_data { static char ** ftrace_function_filter_re(char *buf, int len, int *count) { - char *str, *sep, **re; + char *str, **re; str = kstrndup(buf, len, GFP_KERNEL); if (!str) @@ -2092,8 +2092,7 @@ ftrace_function_filter_re(char *buf, int len, int *count) * The argv_split function takes white space * as a separator, so convert ',' into spaces. */ - while ((sep = strchr(str, ','))) - *sep = ' '; + strreplace(str, ',', ' '); re = argv_split(GFP_KERNEL, str, count); kfree(str); -- cgit v1.2.3 From ff14417c0a00c9a906b4ba79fbecb79bd2435207 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 25 Jun 2015 15:02:28 -0700 Subject: kernel/trace/blktrace.c: use strreplace() in do_blk_trace_setup() Part of the disassembly of do_blk_trace_setup: 231b: e8 00 00 00 00 callq 2320 231c: R_X86_64_PC32 strlen+0xfffffffffffffffc 2320: eb 0a jmp 232c 2322: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 2328: 48 83 c3 01 add $0x1,%rbx 232c: 48 39 d8 cmp %rbx,%rax 232f: 76 47 jbe 2378 2331: 41 80 3c 1c 2f cmpb $0x2f,(%r12,%rbx,1) 2336: 75 f0 jne 2328 2338: 41 c6 04 1c 5f movb $0x5f,(%r12,%rbx,1) 233d: 4c 89 e7 mov %r12,%rdi 2340: e8 00 00 00 00 callq 2345 2341: R_X86_64_PC32 strlen+0xfffffffffffffffc 2345: eb e1 jmp 2328 Yep, that's right: gcc isn't smart enough to realize that replacing '/' by '_' cannot change the strlen(), so we call it again and again (at least when a '/' is found). Even if gcc were that smart, this construction would still loop over the string twice, once for the initial strlen() call and then the open-coded loop. Let's simply use strreplace() instead. Signed-off-by: Rasmus Villemoes Acked-by: Steven Rostedt Liked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/blktrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 483cecfa5c17..4eeae4674b5a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -439,7 +439,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; - int ret, i; + int ret; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; + strreplace(buts->name, '/', '_'); bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) -- cgit v1.2.3 From 51229b495340bd7a02ce3622d1966829b67054ea Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 25 Jun 2015 15:03:56 -0700 Subject: exit,stats: /* obey this comment */ There is a helpful comment in do_exit() that states we sync the mm's RSS info before statistics gathering. The function that does the statistics gathering is called right above that comment. Change the code to obey the comment. Signed-off-by: Rik van Riel Cc: Oleg Nesterov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 185752a729f6..031325e9acf9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,10 +711,10 @@ void do_exit(long code) current->comm, task_pid_nr(current), preempt_count()); - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); -- cgit v1.2.3