From b7522056ec1a62a5f3246627e12ef48e6274c21c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 13 Jan 2016 18:39:58 +0300 Subject: ftrace: Remove unused nr_trampolines var It's not needed & not used since introducing old_hash: commit fef5aeeee9e371 ("ftrace: Replace tramp_hash with old_*_hash to save space"). Link: http://lkml.kernel.org/r/1452699598-27610-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 60048c50404e..65460ad93c7b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -164,7 +164,6 @@ struct ftrace_ops { ftrace_func_t saved_func; int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE - int nr_trampolines; struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; struct ftrace_ops_hash old_hash; -- cgit v1.2.3 From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..2ead22dd74a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -87,7 +87,8 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @start_pid: timer statistics field to store the pid of the task which + * @is_rel: Set if the timer was armed relative + * @start_pid: timer statistics field to store the pid of the task which * started the timer * @start_site: timer statistics field to store the site where the timer * was started @@ -101,7 +102,8 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; - unsigned long state; + u8 state; + u8 is_rel; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { } #endif +static inline ktime_t +__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) +{ + ktime_t rem = ktime_sub(timer->node.expires, now); + + /* + * Adjust relative timers for the extra we added in + * hrtimer_start_range_ns() to prevent short timeouts. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) + rem.tv64 -= hrtimer_resolution; + return rem; +} + +static inline ktime_t +hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) +{ + return __hrtimer_expires_remaining_adjusted(timer, + timer->base->get_time()); +} + extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); @@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer) } /* Query timers: */ -extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); +extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); + +static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + return __hrtimer_get_remaining(timer, false); +} extern u64 hrtimer_get_next_event(void); -- cgit v1.2.3 From b4ace4f1ae07691b4f6ea9f3e92efbec083df058 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 19 Jan 2016 14:27:08 -0500 Subject: soreuseport: fix NULL ptr dereference SO_REUSEPORT after bind Marc Dionne discovered a NULL pointer dereference when setting SO_REUSEPORT on a socket after it is bound. This patch removes the assumption that at least one socket in the reuseport group is bound with the SO_REUSEPORT option before other bind calls occur. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Reported-by: Marc Dionne Signed-off-by: Craig Gallek Tested-by: Marc Dionne Signed-off-by: David S. Miller --- include/net/sock_reuseport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 7dda3d7adba8..aecd30308d50 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -16,7 +16,7 @@ struct sock_reuseport { }; extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, -- cgit v1.2.3 From c240906d36653944d5c049df7ce667a7e8bea6ac Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 19 Jan 2016 10:46:58 +0000 Subject: drm/atomic-helper: Export framebuffer_changed() The Rockchip driver cannot use drm_atomic_helper_wait_for_vblanks() because it has hardware counters for neither vblanks nor scanlines. In order to simplify re-implementing the functionality for this driver, export the framebuffer_changed() helper so it can be reused. Signed-off-by: John Keeping Reviewed-by: Daniel Vetter --- include/drm/drm_atomic_helper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h index 89d008dc08e2..fe5efada9d68 100644 --- a/include/drm/drm_atomic_helper.h +++ b/include/drm/drm_atomic_helper.h @@ -42,6 +42,10 @@ int drm_atomic_helper_commit(struct drm_device *dev, struct drm_atomic_state *state, bool async); +bool drm_atomic_helper_framebuffer_changed(struct drm_device *dev, + struct drm_atomic_state *old_state, + struct drm_crtc *crtc); + void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev, struct drm_atomic_state *old_state); -- cgit v1.2.3 From b16c29191dc89bd877af99a7b04ce4866728a3e0 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 18 Jan 2016 19:23:51 -0500 Subject: netfilter: nf_conntrack: use safer way to lock all buckets When we need to lock all buckets in the connection hashtable we'd attempt to lock 1024 spinlocks, which is way more preemption levels than supported by the kernel. Furthermore, this behavior was hidden by checking if lockdep is enabled, and if it was - use only 8 buckets(!). Fix this by using a global lock and synchronize all buckets on it when we need to lock them all. This is pretty heavyweight, but is only done when we need to resize the hashtable, and that doesn't happen often enough (or at all). Signed-off-by: Sasha Levin Acked-by: Jesper Dangaard Brouer Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 788ef58a66b9..62e17d1319ff 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -79,12 +79,10 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *proto); -#ifdef CONFIG_LOCKDEP -# define CONNTRACK_LOCKS 8 -#else -# define CONNTRACK_LOCKS 1024 -#endif +#define CONNTRACK_LOCKS 1024 + extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; +void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; -- cgit v1.2.3 From 386744425e35e04984c6e741c7750fd6eef1a9df Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 1 Jul 2015 14:17:58 +0200 Subject: swiotlb: Make linux/swiotlb.h standalone includible This header file uses the enum dma_data_direction and struct page types without explicitly including the corresponding header files. This makes it rely on the includer to have included the proper headers before. To fix this, include linux/dma-direction.h and forward-declare struct page. The swiotlb_free() function is also annotated __init, therefore requires linux/init.h to be included as well. Signed-off-by: Thierry Reding Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index e7a018eaf3a2..017fced60242 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -1,10 +1,13 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include +#include #include struct device; struct dma_attrs; +struct page; struct scatterlist; extern int swiotlb_force; -- cgit v1.2.3 From ce87fc6ce3f9f4488546187e3757cf666d9d4a2a Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Jan 2016 17:59:49 -0800 Subject: gro: Make GRO aware of lightweight tunnels. GRO is currently not aware of tunnel metadata generated by lightweight tunnels and stored in the dst. This leads to two possible problems: * Incorrectly merging two frames that have different metadata. * Leaking of allocated metadata from merged frames. This avoids those problems by comparing the tunnel information before merging, similar to how we handle other metadata (such as vlan tags), and releasing any state when we are done. Reported-by: John Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.") Signed-off-by: Jesse Gross Acked-by: Eric Dumazet Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 6816f0fa5693..30a56ab2ccfb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -44,6 +44,24 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) return dst && !(dst->flags & DST_METADATA); } +static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + const struct metadata_dst *a, *b; + + if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) + return 0; + + a = (const struct metadata_dst *) skb_dst(skb_a); + b = (const struct metadata_dst *) skb_dst(skb_b); + + if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len) + return 1; + + return memcmp(&a->u.tun_info, &b->u.tun_info, + sizeof(a->u.tun_info) + a->u.tun_info.options_len); +} + struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); -- cgit v1.2.3 From fae3fde65138b6071b1b0e0b567d4058a8b6a88c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:00:50 +0100 Subject: perf: Collapse and fix event_function_call() users There is one common bug left in all the event_function_call() users, between loading ctx->task and getting to the remote_function(), ctx->task can already have been changed. Therefore we need to double check and retry if ctx->task != current. Insert another trampoline specific to event_function_call() that checks for this and further validates state. This also allows getting rid of the active/inactive functions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6612732d8fd0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); -extern int __perf_event_disable(void *info); +extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * -- cgit v1.2.3 From 4877be9019baaf1432f9117bff4873e4ad518d91 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 21 Jan 2016 15:56:28 -0500 Subject: net: sock: remove dead cgroup methods from struct proto The cgroup methods are no longer used after baac50bbc3cd ("net: tcp_memcontrol: simplify linkage between socket and page counter"). The hunk to delete them was included in the original patch but must have gotten lost during conflict resolution on the way upstream. Fixes: baac50bbc3cd ("net: tcp_memcontrol: simplify linkage between socket and page counter") Signed-off-by: Johannes Weiner Signed-off-by: David S. Miller --- include/net/sock.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b9e7b3d863a0..f5ea148853e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1035,18 +1035,6 @@ struct proto { struct list_head node; #ifdef SOCK_REFCNT_DEBUG atomic_t socks; -#endif -#ifdef CONFIG_MEMCG_KMEM - /* - * cgroup specific init/deinit functions. Called once for all - * protocols that implement it, from cgroups populate function. - * This function has to setup any files the protocol want to - * appear in the kmem cgroup filesystem. - */ - int (*init_cgroup)(struct mem_cgroup *memcg, - struct cgroup_subsys *ss); - void (*destroy_cgroup)(struct mem_cgroup *memcg); - struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif int (*diag_destroy)(struct sock *sk, int err); }; -- cgit v1.2.3 From 6a84f57241e1fb9fb6772256f538d1073359a32d Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Thu, 21 Jan 2016 14:02:39 -0800 Subject: raid6/algos.c : bug fix : Add the missing definitions to the pq.h file Adding these pr_info and pr_err definitions so as to allow code to be compiled successfully for testing in userspace, since the printk has been replaced by pr_info and pr_err in algos.c Absence of these definitions result in the compilation errors such as ' undefined reference to `pr_info' ' ' undefined reference to `pr_err' ' Cc: NeilBrown Cc: Anton Blanchard Cc: Fenghua Yu Signed-off-by: Gayatri Kammela Signed-off-by: Shaohua Li --- include/linux/raid/pq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index a7a06d1dcf9c..a0118d5929a9 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -152,6 +152,8 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, # define jiffies raid6_jiffies() # define printk printf +# define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +# define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__) # define GFP_KERNEL 0 # define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ PROT_READ|PROT_WRITE, \ -- cgit v1.2.3 From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Jan 2016 12:18:41 -0500 Subject: cpuset: make mm migration asynchronous If "cpuset.memory_migrate" is set, when a process is moved from one cpuset to another with a different memory node mask, pages in used by the process are migrated to the new set of nodes. This was performed synchronously in the ->attach() callback, which is synchronized against process management. Recently, the synchronization was changed from per-process rwsem to global percpu rwsem for simplicity and optimization. Combined with the synchronous mm migration, this led to deadlocks because mm migration could schedule a work item which may in turn try to create a new worker blocking on the process management lock held from cgroup process migration path. This heavy an operation shouldn't be performed synchronously from that deep inside cgroup migration in the first place. This patch punts the actual migration to an ordered workqueue and updates cgroup process migration and cpuset config update paths to flush the workqueue after all locks are released. This way, the operations still seem synchronous to userland without entangling mm migration with process management synchronization. CPU hotplug can also invoke mm migration but there's no reason for it to wait for mm migrations and thus doesn't synchronize against their completions. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Cc: stable@vger.kernel.org # v4.4+ --- include/linux/cpuset.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..fea160ee5803 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +extern void cpuset_post_attach_flush(void); + #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline void cpuset_post_attach_flush(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ -- cgit v1.2.3 From aa226ff4a1ce79f229c6b7a4c0a14e17fececd01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Jan 2016 15:31:11 -0500 Subject: cgroup: make sure a parent css isn't offlined before its children There are three subsystem callbacks in css shutdown path - css_offline(), css_released() and css_free(). Except for css_released(), cgroup core didn't guarantee the order of invocation. css_offline() or css_free() could be called on a parent css before its children. This behavior is unexpected and led to bugs in cpu and memory controller. This patch updates offline path so that a parent css is never offlined before its children. Each css keeps online_cnt which reaches zero iff itself and all its children are offline and offline_css() is invoked only after online_cnt reaches zero. This fixes the memory controller bug and allows the fix for cpu controller. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Reported-by: Brian Christiansen Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com Cc: Heiko Carstens Cc: Peter Zijlstra Cc: stable@vger.kernel.org --- include/linux/cgroup-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f540f7f588d..789471dba6fb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -127,6 +127,12 @@ struct cgroup_subsys_state { */ u64 serial_nr; + /* + * Incremented by online self and children. Used to guarantee that + * parents are not offlined before their children. + */ + atomic_t online_cnt; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; -- cgit v1.2.3 From facc432faa59414bd7c60c307ff1645154a66c98 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Jan 2016 11:43:44 +0100 Subject: net: simplify napi_synchronize() to avoid warnings The napi_synchronize() function is defined twice: The definition for SMP builds waits for other CPUs to be done, while the uniprocessor variant just contains a barrier and ignores its argument. In the mvneta driver, this leads to a warning about an unused variable when we lookup the NAPI struct of another CPU and then don't use it: ethernet/marvell/mvneta.c: In function 'mvneta_percpu_notifier': ethernet/marvell/mvneta.c:2910:30: error: unused variable 'other_port' [-Werror=unused-variable] There are no other CPUs on a UP build, so that code never runs, but gcc does not know this. The nicest solution seems to be to turn the napi_synchronize() helper into an inline function for the UP case as well, as that leads gcc to not complain about the argument being unused. Once we do that, we can also combine the two cases into a single function definition and use if(IS_ENABLED()) rather than #ifdef to make it look a bit nicer. The warning first came up in linux-4.4, but I failed to catch it earlier. Signed-off-by: Arnd Bergmann Fixes: f86428854480 ("net: mvneta: Statically assign queues to CPUs") Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ac140dcb789..289c2314d766 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -512,7 +512,6 @@ static inline void napi_enable(struct napi_struct *n) clear_bit(NAPI_STATE_NPSVC, &n->state); } -#ifdef CONFIG_SMP /** * napi_synchronize - wait until NAPI is not running * @n: napi context @@ -523,12 +522,12 @@ static inline void napi_enable(struct napi_struct *n) */ static inline void napi_synchronize(const struct napi_struct *n) { - while (test_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); + if (IS_ENABLED(CONFIG_SMP)) + while (test_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + else + barrier(); } -#else -# define napi_synchronize(n) barrier() -#endif enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, -- cgit v1.2.3 From 71f50c6d9a2276f3ec85384bffe2aee1962f4669 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Jan 2016 01:33:51 +0900 Subject: of: drop symbols declared by _OF_DECLARE() from modules The users of this macro (OF_EARLYCON_DECLARE, CLK_OF_DECLARE, IRQCHIP_DECLARE, etc.) are only parsed in the early boot stage. Such symbols contained in modules are never used. This commit fixes the link error introduced by commit b8d20e06eaad ("serial: 8250_uniphier: add earlycon support"); the combination of CONFIG_SERIAL_8250_UNIPHIER=m and CONFIG_SERIAL_8250_CONSOLE=y fails to link: ERROR: "early_serial8250_setup" [drivers/tty/serial/8250/8250_uniphier.ko] undefined! Fixes: b8d20e06eaad ("serial: 8250_uniphier: add earlycon support") Signed-off-by: Masahiro Yamada Acked-by: Arnd Bergmann Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index dd10626a615f..dc6e39696b64 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -929,7 +929,7 @@ static inline int of_get_available_child_count(const struct device_node *np) return num; } -#ifdef CONFIG_OF +#if defined(CONFIG_OF) && !defined(MODULE) #define _OF_DECLARE(table, name, compat, fn, fn_type) \ static const struct of_device_id __of_table_##name \ __used __section(__##table##_of_table) \ -- cgit v1.2.3 From 7fd13615992ae0fc132c9abb24511be43f3b5d9d Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Thu, 21 Jan 2016 09:48:14 -0200 Subject: tracing/dma-buf/fence: Fix timeline str value on fence_annotate_wait_on timeline was wrongly assigned with ->get_driver_name(). Link: http://lkml.kernel.org/r/1453376895-30747-1-git-send-email-gustavo@padovan.org Signed-off-by: Gustavo Padovan Signed-off-by: Steven Rostedt --- include/trace/events/fence.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/fence.h b/include/trace/events/fence.h index 98feb1b82896..d6dfa05ba322 100644 --- a/include/trace/events/fence.h +++ b/include/trace/events/fence.h @@ -17,7 +17,7 @@ TRACE_EVENT(fence_annotate_wait_on, TP_STRUCT__entry( __string(driver, fence->ops->get_driver_name(fence)) - __string(timeline, fence->ops->get_driver_name(fence)) + __string(timeline, fence->ops->get_timeline_name(fence)) __field(unsigned int, context) __field(unsigned int, seqno) -- cgit v1.2.3 From fb3296335500aaff61333df8eabbccf28761c79d Mon Sep 17 00:00:00 2001 From: Danesh Petigara Date: Mon, 11 Jan 2016 13:22:26 -0800 Subject: drivers: ata: wake port before DMA stop for ALPM The AHCI driver code stops and starts port DMA engines at will without considering the power state of the particular port. The AHCI specification isn't very clear on how to handle this scenario, leaving implementation open to interpretation. Broadcom's STB SATA host controller is unable to handle port DMA controller restarts when the port in question is in low power mode. When a port enters partial or slumber mode, its PHY is powered down. When a controller restart is requested, the controller's internal state machine expects the PHY to be brought back up by software which never happens in this case, resulting in failures. To avoid this situation, logic is added to manually wake up the port just before its DMA engine is stopped, if the port happens to be in a low power state. HBA initiated power management ensures that the port eventually returns to its configured low power state, when the link is idle (as per the conditions listed in the spec). A new host flag is also added to ensure this logic is only exercised for hosts with the above limitation. tj: Formatting changes. Signed-off-by: Danesh Petigara Reviewed-by: Markus Mayer Signed-off-by: Tejun Heo --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 851821bfd553..bec2abbd7ab2 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -526,6 +526,7 @@ enum ata_lpm_policy { enum ata_lpm_hints { ATA_LPM_EMPTY = (1 << 0), /* port empty/probing */ ATA_LPM_HIPM = (1 << 1), /* may use HIPM */ + ATA_LPM_WAKE_ONLY = (1 << 2), /* only wake up link */ }; /* forward declarations */ -- cgit v1.2.3 From 530cbe100ef7587aa5b5ac3a4b670cda4d50e598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 13:52:25 +0000 Subject: irqdomain: Allow domain lookup with DOMAIN_BUS_WIRED token Let's take the (outlandish) example of an interrupt controller capable of handling both wired interrupts and PCI MSIs. With the current code, the PCI MSI domain is going to be tagged with DOMAIN_BUS_PCI_MSI, and the wired domain with DOMAIN_BUS_ANY. Things get hairy when we start looking up the domain for a wired interrupt (typically when creating it based on some firmware information - DT or ACPI). In irq_create_fwspec_mapping(), we perform the lookup using DOMAIN_BUS_ANY, which is actually used as a wildcard. This gives us one chance out of two to end up with the wrong domain, and we try to configure a wired interrupt with the MSI domain. Everything grinds to a halt pretty quickly. What we really need to do is to start looking for a domain that would uniquely identify a wired interrupt domain, and only use DOMAIN_BUS_ANY as a fallback. In order to solve this, let's introduce a new DOMAIN_BUS_WIRED token, which is going to be used exactly as described above. Of course, this depends on the irqchip to setup the domain bus_token, and nobody had to implement this so far. Only so far. Signed-off-by: Marc Zyngier Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Cc: Grant Likely Cc: Thomas Petazzoni Cc: Jiang Liu Link: http://lkml.kernel.org/r/1453816347-32720-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f64622ad02c1..04579d9fbce4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -70,6 +70,7 @@ struct irq_fwspec { */ enum irq_domain_bus_token { DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, -- cgit v1.2.3 From 602eb48966d7b7f7e64dca8d9ea2842d83bfae73 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 24 Jan 2016 17:36:04 +0000 Subject: drm/etnaviv: add further minor features and varyings count Export further minor feature bitmasks and the varyings count from the GPU specifications registers to userspace. Acked-by: Christian Gmeiner Signed-off-by: Russell King Signed-off-by: Lucas Stach --- include/uapi/drm/etnaviv_drm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/etnaviv_drm.h b/include/uapi/drm/etnaviv_drm.h index 4cc989ad6851..f95e1c43c3fb 100644 --- a/include/uapi/drm/etnaviv_drm.h +++ b/include/uapi/drm/etnaviv_drm.h @@ -48,6 +48,8 @@ struct drm_etnaviv_timespec { #define ETNAVIV_PARAM_GPU_FEATURES_2 0x05 #define ETNAVIV_PARAM_GPU_FEATURES_3 0x06 #define ETNAVIV_PARAM_GPU_FEATURES_4 0x07 +#define ETNAVIV_PARAM_GPU_FEATURES_5 0x08 +#define ETNAVIV_PARAM_GPU_FEATURES_6 0x09 #define ETNAVIV_PARAM_GPU_STREAM_COUNT 0x10 #define ETNAVIV_PARAM_GPU_REGISTER_MAX 0x11 @@ -59,6 +61,7 @@ struct drm_etnaviv_timespec { #define ETNAVIV_PARAM_GPU_BUFFER_SIZE 0x17 #define ETNAVIV_PARAM_GPU_INSTRUCTION_COUNT 0x18 #define ETNAVIV_PARAM_GPU_NUM_CONSTANTS 0x19 +#define ETNAVIV_PARAM_GPU_NUM_VARYINGS 0x1a #define ETNA_MAX_PIPES 4 -- cgit v1.2.3 From 0bfd464d3fdd5bb322f9cace4cc47f1796545cf7 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 21:13:44 -0800 Subject: tty: Wait interruptibly for tty lock on reopen Allow a signal to interrupt the wait for a tty reopen; eg., if the tty has starting final close and is waiting for the device to drain. Signed-off-by: Peter Hurley Cc: stable # 4.4 Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 2fd8708ea888..d9fb4b043f56 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -649,6 +649,7 @@ extern long vt_compat_ioctl(struct tty_struct *tty, /* tty_mutex.c */ /* functions for preparation of BKL removal */ extern void __lockfunc tty_lock(struct tty_struct *tty); +extern int tty_lock_interruptible(struct tty_struct *tty); extern void __lockfunc tty_unlock(struct tty_struct *tty); extern void __lockfunc tty_lock_slave(struct tty_struct *tty); extern void __lockfunc tty_unlock_slave(struct tty_struct *tty); -- cgit v1.2.3 From b3c6de492b9ea30a8dcc535d4dc2eaaf0bb3f116 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 21 Jan 2016 16:47:29 +0100 Subject: cleancache: constify cleancache_ops structure The cleancache_ops structure is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/cleancache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index bda5ec0b4b4d..cb3e14234502 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -37,7 +37,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern int cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); -- cgit v1.2.3 From a39bb9a0557a3fc860c3c9d67a7326b0d2f1bd66 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 7 Jan 2016 05:06:13 +0800 Subject: include/linux/cleancache.h: Clean up code Let cleancache_fs_enabled() call cleancache_fs_enabled_mapping() directly. Remove redundant variable ret in cleancache_get_page(). Signed-off-by: Chen Gang Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/cleancache.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index cb3e14234502..fccf7f44139d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -48,14 +48,14 @@ extern void __cleancache_invalidate_fs(struct super_block *); #ifdef CONFIG_CLEANCACHE #define cleancache_enabled (1) -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) { return mapping->host->i_sb->cleancache_poolid >= 0; } +static inline bool cleancache_fs_enabled(struct page *page) +{ + return cleancache_fs_enabled_mapping(page->mapping); +} #else #define cleancache_enabled (0) #define cleancache_fs_enabled(_page) (0) @@ -89,11 +89,9 @@ static inline void cleancache_init_shared_fs(struct super_block *sb) static inline int cleancache_get_page(struct page *page) { - int ret = -1; - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; + return __cleancache_get_page(page); + return -1; } static inline void cleancache_put_page(struct page *page) -- cgit v1.2.3 From 1eed677933b816978abc4e3e18ecae5f254cb9be Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 22 Jan 2016 01:49:07 +0800 Subject: sctp: fix the transport dead race check by using atomic_add_unless on refcnt Now when __sctp_lookup_association is running in BH, it will try to check if t->dead is set, but meanwhile other CPUs may be freeing this transport and this assoc and if it happens that __sctp_lookup_association checked t->dead a bit too early, it may think that the association is still good while it was already freed. So we fix this race by using atomic_add_unless in sctp_transport_hold. After we get one transport from hashtable, we will hold it only when this transport's refcnt is not 0, so that we can make sure t->asoc cannot be freed before we hold the asoc again. Note that sctp association is not freed using RCU so we can't use atomic_add_unless() with it as it may just be too late for that either. Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path") Reported-by: Vlad Yasevich Signed-off-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 20e72129be1c..344da04e2e30 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -955,7 +955,7 @@ void sctp_transport_route(struct sctp_transport *, union sctp_addr *, void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk); void sctp_transport_free(struct sctp_transport *); void sctp_transport_reset_timers(struct sctp_transport *); -void sctp_transport_hold(struct sctp_transport *); +int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32); -- cgit v1.2.3 From 47faa1e4c50ec26e6e75dcd1ce53f064bd45f729 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 22 Jan 2016 01:49:09 +0800 Subject: sctp: remove the dead field of sctp_transport After we use refcnt to check if transport is alive, the dead can be removed from sctp_transport. The traversal of transport_addr_list in procfs dump is using list_for_each_entry_rcu, no need to check if it has been freed. sctp_generate_t3_rtx_event and sctp_generate_heartbeat_event is protected by sock lock, it's not necessary to check dead, either. also, the timers are cancelled when sctp_transport_free() is called, that it doesn't wait for refcnt to reach 0 to cancel them. Signed-off-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 344da04e2e30..205630bb5010 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -756,7 +756,6 @@ struct sctp_transport { /* Reference counting. */ atomic_t refcnt; - __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -766,7 +765,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - rto_pending:1, + __u32 rto_pending:1, /* * hb_sent : a flag that signals that we have a pending -- cgit v1.2.3 From e03e7ee34fdd1c3ef494949a75cb8c61c7265fa9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Jan 2016 20:59:49 -0800 Subject: perf/bpf: Convert perf_event_array to use struct file Robustify refcounting. Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160126045947.GA40151@ast-mbp.thefacebook.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6612732d8fd0..4f90434b8d64 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); -extern struct perf_event *perf_event_get(unsigned int fd); +extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } -static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); -- cgit v1.2.3 From c6e5b73242d2d9172ea880483bc4ba7ffca0cfb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2016 16:07:41 +0200 Subject: perf: Synchronously clean up child events The orphan cleanup workqueue doesn't always catch orphans, for example, if they never schedule after they are orphaned. IOW, the event leak is still very real. It also wouldn't work for kernel counters. Doing it synchonously is a little hairy due to lock inversion issues, but is made to work. Patch based on work by Alexander Shishkin. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f90434b8d64..b35a61a481fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -634,9 +634,6 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; - - struct delayed_work orphans_remove; - bool orphans_remove_sched; }; /* -- cgit v1.2.3 From 114f9f1e038eb23935c20fb54f49f07caaa0546d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 26 Jan 2016 17:19:09 -0500 Subject: Bluetooth: L2CAP: Introduce proper defines for PSM ranges Having proper defines makes the code a bit readable, it also avoids duplicating hard-coded values since these are also needed when auto-allocating PSM values (in a subsequent patch). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 52899291f401..5ee3c689c863 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -252,6 +252,12 @@ struct l2cap_conn_rsp { #define L2CAP_PSM_3DSP 0x0021 #define L2CAP_PSM_IPSP 0x0023 /* 6LoWPAN */ +#define L2CAP_PSM_DYN_START 0x1001 +#define L2CAP_PSM_DYN_END 0xffff +#define L2CAP_PSM_AUTO_END 0x10ff +#define L2CAP_PSM_LE_DYN_START 0x0080 +#define L2CAP_PSM_LE_DYN_END 0x00ff + /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 -- cgit v1.2.3 From 0d9bacb6c8265e7aa75ac28ae9b5d7748065942f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Jan 2016 14:28:48 +0900 Subject: iommu: Update struct iommu_ops comments Update the comments around struct iommu_ops to match current state and fix a few typos while at it. Signed-off-by: Magnus Damm Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f28dff313b07..a5c539fa5d2b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -133,8 +133,9 @@ struct iommu_dm_region { /** * struct iommu_ops - iommu ops and capabilities - * @domain_init: init iommu domain - * @domain_destroy: destroy iommu domain + * @capable: check capability + * @domain_alloc: allocate iommu domain + * @domain_free: free iommu domain * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain @@ -144,8 +145,15 @@ struct iommu_dm_region { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @get_dm_regions: Request list of direct mapping requirements for a device + * @put_dm_regions: Free list of direct mapping requirements for a device + * @domain_window_enable: Configure and enable a particular window for a domain + * @domain_window_disable: Disable a particular window for a domain + * @domain_set_windows: Set the number of windows for a domain + * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping * @pgsize_bitmap: bitmap of supported page sizes * @priv: per-instance data private to the iommu driver @@ -182,9 +190,9 @@ struct iommu_ops { int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); - /* Set the numer of window per domain */ + /* Set the number of windows per domain */ int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count); - /* Get the numer of window per domain */ + /* Get the number of windows per domain */ u32 (*domain_get_windows)(struct iommu_domain *domain); #ifdef CONFIG_OF_IOMMU -- cgit v1.2.3 From 23d11a58a9a60dcb52c8fc6494efce908b24c295 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jan 2016 05:59:46 -0500 Subject: workqueue: skip flush dependency checks for legacy workqueues fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") implemented flush dependency warning which triggers if a PF_MEMALLOC task or WQ_MEM_RECLAIM workqueue tries to flush a !WQ_MEM_RECLAIM workquee. This assumes that workqueues marked with WQ_MEM_RECLAIM sit in memory reclaim path and making it depend on something which may need more memory to make forward progress can lead to deadlocks. Unfortunately, workqueues created with the legacy create*_workqueue() interface always have WQ_MEM_RECLAIM regardless of whether they are depended upon memory reclaim or not. These spurious WQ_MEM_RECLAIM markings cause spurious triggering of the flush dependency checks. WARNING: CPU: 0 PID: 6 at kernel/workqueue.c:2361 check_flush_dependency+0x138/0x144() workqueue: WQ_MEM_RECLAIM deferwq:deferred_probe_work_func is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu ... Workqueue: deferwq deferred_probe_work_func [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x94/0xd4) [] (dump_stack) from [] (warn_slowpath_common+0x80/0xb0) [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x30/0x40) [] (warn_slowpath_fmt) from [] (check_flush_dependency+0x138/0x144) [] (check_flush_dependency) from [] (flush_work+0x50/0x15c) [] (flush_work) from [] (lru_add_drain_all+0x130/0x180) [] (lru_add_drain_all) from [] (migrate_prep+0x8/0x10) [] (migrate_prep) from [] (alloc_contig_range+0xd8/0x338) [] (alloc_contig_range) from [] (cma_alloc+0xe0/0x1ac) [] (cma_alloc) from [] (__alloc_from_contiguous+0x38/0xd8) [] (__alloc_from_contiguous) from [] (__dma_alloc+0x240/0x278) [] (__dma_alloc) from [] (arm_dma_alloc+0x54/0x5c) [] (arm_dma_alloc) from [] (dmam_alloc_coherent+0xc0/0xec) [] (dmam_alloc_coherent) from [] (ahci_port_start+0x150/0x1dc) [] (ahci_port_start) from [] (ata_host_start.part.3+0xc8/0x1c8) [] (ata_host_start.part.3) from [] (ata_host_activate+0x50/0x148) [] (ata_host_activate) from [] (ahci_host_activate+0x44/0x114) [] (ahci_host_activate) from [] (ahci_platform_init_host+0x1d8/0x3c8) [] (ahci_platform_init_host) from [] (tegra_ahci_probe+0x448/0x4e8) [] (tegra_ahci_probe) from [] (platform_drv_probe+0x50/0xac) [] (platform_drv_probe) from [] (driver_probe_device+0x214/0x2c0) [] (driver_probe_device) from [] (bus_for_each_drv+0x60/0x94) [] (bus_for_each_drv) from [] (__device_attach+0xb0/0x114) [] (__device_attach) from [] (bus_probe_device+0x84/0x8c) [] (bus_probe_device) from [] (deferred_probe_work_func+0x68/0x98) [] (deferred_probe_work_func) from [] (process_one_work+0x120/0x3f8) [] (process_one_work) from [] (worker_thread+0x38/0x55c) [] (worker_thread) from [] (kthread+0xdc/0xf4) [] (kthread) from [] (ret_from_fork+0x14/0x3c) Fix it by marking workqueues created via create*_workqueue() with __WQ_LEGACY and disabling flush dependency checks on them. Signed-off-by: Tejun Heo Reported-and-tested-by: Thierry Reding Link: http://lkml.kernel.org/g/20160126173843.GA11115@ulmo.nvidia.com Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") --- include/linux/workqueue.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0e32bc71245e..ca73c503b92a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -311,6 +311,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ - alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \ - 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ + WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ - alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name) + alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); -- cgit v1.2.3 From 21603fc45bdef3696f45d80a1c9d730e06b925c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Wed, 27 Jan 2016 22:54:06 +0100 Subject: tcp: Change reference to experimental CWND RFC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jörg Thalheim Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8ea19977ea53..f6f8f032c73e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -216,7 +216,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ -/* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */ +/* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ -- cgit v1.2.3 From 8a9ebe717a133ba7bc90b06047f43cc6b8bcb8b3 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 18 Jan 2016 14:09:27 -0600 Subject: target: Fix WRITE_SAME/DISCARD conversion to linux 512b sectors In a couple places we are not converting to/from the Linux block layer 512 bytes sectors. 1. The request queue values and what we do are a mismatch of things: max_discard_sectors - This is in linux block layer 512 byte sectors. We are just copying this to max_unmap_lba_count. discard_granularity - This is in bytes. We are converting it to Linux block layer 512 byte sectors. discard_alignment - This is in bytes. We are just copying this over. The problem is that the core LIO code exports these values in spc_emulate_evpd_b0 and we use them to test request arguments in sbc_execute_unmap, but we never convert to the block size we export to the initiator. If we are not using 512 byte sectors then we are exporting the wrong values or are checks are off. And, for the discard_alignment/bytes case we are just plain messed up. 2. blkdev_issue_discard's start and number of sector arguments are supposed to be in linux block layer 512 byte sectors. We are currently passing in the values we get from the initiator which might be based on some other sector size. There is a similar problem in iblock_execute_write_same where the bio functions want values in 512 byte sectors but we are passing in what we got from the initiator. Signed-off-by: Mike Christie Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Nicholas Bellinger --- include/target/target_core_backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 56cf8e485ef2..28ee5c2e6bcd 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -94,5 +94,8 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd, sense_reason_t (*exec_cmd)(struct se_cmd *cmd)); bool target_sense_desc_format(struct se_device *dev); +sector_t target_to_linux_sector(struct se_device *dev, sector_t lb); +bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, + struct request_queue *q, int block_size); #endif /* TARGET_CORE_BACKEND_H */ -- cgit v1.2.3 From 6f21c96a78b835259546d8f3fb4edff0f651d478 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 Jan 2016 12:30:19 +0100 Subject: ipv6: enforce flowi6_oif usage in ip6_dst_lookup_tail() The current implementation of ip6_dst_lookup_tail basically ignore the egress ifindex match: if the saddr is set, ip6_route_output() purposefully ignores flowi6_oif, due to the commit d46a9d678e4c ("net: ipv6: Dont add RT6_LOOKUP_F_IFACE flag if saddr set"), if the saddr is 'any' the first route lookup in ip6_dst_lookup_tail fails, but upon failure a second lookup will be performed with saddr set, thus ignoring the ifindex constraint. This commit adds an output route lookup function variant, which allows the caller to specify lookup flags, and modify ip6_dst_lookup_tail() to enforce the ifindex match on the second lookup via said helper. ip6_route_output() becames now a static inline function build on top of ip6_route_output_flags(); as a side effect, out-of-tree modules need now a GPL license to access the output route lookup functionality. Signed-off-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 877f682989b8..295d291269e2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -64,8 +64,16 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) void ip6_route_input(struct sk_buff *skb); -struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, - struct flowi6 *fl6); +struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, + struct flowi6 *fl6, int flags); + +static inline struct dst_entry *ip6_route_output(struct net *net, + const struct sock *sk, + struct flowi6 *fl6) +{ + return ip6_route_output_flags(net, sk, fl6, 0); +} + struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags); -- cgit v1.2.3 From 65f87ee71852a754f7981d0653e7136039b8798a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 25 Jan 2016 17:23:18 -0800 Subject: fs, block: force direct-I/O for dax-enabled block devices Similar to the file I/O path, re-direct all I/O to the DAX path for I/O to a block-device special file. Both regular files and device special files can use the common filp->f_mapping->host lookup to determing is DAX is enabled. Otherwise, we confuse the DAX code that does not expect to find live data in the page cache: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 7676 at mm/filemap.c:217 __delete_from_page_cache+0x9f6/0xb60() Modules linked in: CPU: 0 PID: 7676 Comm: a.out Not tainted 4.4.0+ #276 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 00000000ffffffff ffff88006d3f7738 ffffffff82999e2d 0000000000000000 ffff8800620a0000 ffffffff86473d20 ffff88006d3f7778 ffffffff81352089 ffffffff81658d36 ffffffff86473d20 00000000000000d9 ffffea0000009d60 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] __delete_from_page_cache+0x9f6/0xb60 mm/filemap.c:217 [] delete_from_page_cache+0x112/0x200 mm/filemap.c:244 [] __dax_fault+0x859/0x1800 fs/dax.c:487 [] blkdev_dax_fault+0x26/0x30 fs/block_dev.c:1730 [< inline >] wp_pfn_shared mm/memory.c:2208 [] do_wp_page+0xc85/0x14f0 mm/memory.c:2307 [< inline >] handle_pte_fault mm/memory.c:3323 [< inline >] __handle_mm_fault mm/memory.c:3417 [] handle_mm_fault+0x2483/0x4640 mm/memory.c:3446 [] __do_page_fault+0x376/0x960 arch/x86/mm/fault.c:1238 [] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331 [] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264 [] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 ---[ end trace dae21e0f85f1f98c ]--- Fixes: 5a023cdba50c ("block: enable dax for raw block devices") Reported-by: Dmitry Vyukov Reported-by: Kirill A. Shutemov Suggested-by: Jan Kara Reviewed-by: Jan Kara Suggested-by: Matthew Wilcox Tested-by: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..b10002d4a5f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2907,7 +2907,7 @@ extern void replace_mount_options(struct super_block *sb, char *options); static inline bool io_is_direct(struct file *filp) { - return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); + return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); } static inline int iocb_flags(struct file *file) -- cgit v1.2.3 From 9f4736fe7ca804aa79b5916221bb13dfc6221a0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:13:39 -0800 Subject: block: revert runtime dax control of the raw block device Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibilty of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox Cc: Andrew Morton Cc: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..149bec83a907 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXSET _IO(0x12,128) #define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ -- cgit v1.2.3 From d1a5f2b4d8a125943dcb6b032fc7eaefc2c78296 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:25:31 -0800 Subject: block: use DAX for partition table reads Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Reported-by: Ross Zwisler Tested-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: Dan Williams --- include/linux/dax.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 8204c3dc3800..818e45078929 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); + +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t, dax_iodone_t); -- cgit v1.2.3 From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } -- cgit v1.2.3 From 0f26922fe5dc5724b1adbbd54b21bad03590b4f3 Mon Sep 17 00:00:00 2001 From: zengtao Date: Tue, 2 Feb 2016 11:38:34 +0800 Subject: cputime: Prevent 32bit overflow in time[val|spec]_to_cputime() The datatype __kernel_time_t is u32 on 32bit platform, so its subject to overflows in the timeval/timespec to cputime conversion. Currently the following functions are affected: 1. setitimer() 2. timer_create/timer_settime() 3. sys_clock_nanosleep This can happen on MIPS32 and ARM32 with "Full dynticks CPU time accounting" enabled, which is required for CONFIG_NO_HZ_FULL. Enforce u64 conversion to prevent the overflow. Fixes: 31c1fc818715 ("ARM: Kconfig: allow full nohz CPU accounting") Signed-off-by: zengtao Reviewed-by: Arnd Bergmann Cc: Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1454384314-154784-1-git-send-email-prime.zeng@huawei.com Signed-off-by: Thomas Gleixner --- include/asm-generic/cputime_nsecs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 0419485891f2..0f1c6f315cdc 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -75,7 +75,7 @@ typedef u64 __nocast cputime64_t; */ static inline cputime_t timespec_to_cputime(const struct timespec *val) { - u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec; + u64 ret = (u64)val->tv_sec * NSEC_PER_SEC + val->tv_nsec; return (__force cputime_t) ret; } static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) @@ -91,7 +91,8 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) */ static inline cputime_t timeval_to_cputime(const struct timeval *val) { - u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC; + u64 ret = (u64)val->tv_sec * NSEC_PER_SEC + + val->tv_usec * NSEC_PER_USEC; return (__force cputime_t) ret; } static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) -- cgit v1.2.3 From 4b0e4e4af6c6dc8354dcb72182d52c1bc55f12fc Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Sat, 30 Jan 2016 07:59:32 +0200 Subject: drm: add helper to check for wc memory support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Christian König Reviewed-by: Michel Dänzer Signed-off-by: Dave Airlie Signed-off-by: Oded Gabbay Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- include/drm/drm_cache.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h index 7bfb063029d8..461a0558bca4 100644 --- a/include/drm/drm_cache.h +++ b/include/drm/drm_cache.h @@ -35,4 +35,13 @@ void drm_clflush_pages(struct page *pages[], unsigned long num_pages); +static inline bool drm_arch_can_wc_memory(void) +{ +#if defined(CONFIG_PPC) && !defined(CONFIG_NOT_COHERENT_CACHE) + return false; +#else + return true; +#endif +} + #endif -- cgit v1.2.3 From 2db8f9a1d86aa32a9cbf1a975882b4fa162f040f Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Tue, 26 Jan 2016 12:43:00 -0600 Subject: ACPI / CPPC: remove redundant mbox_send_message() declaration Remove a redundant function declaration in cppc_acpi.h for mbox_send_message(). That function is defined in mailbox_client.h, which is already included. Signed-off-by: Timur Tabi Signed-off-by: Rafael J. Wysocki --- include/acpi/cppc_acpi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 717a29810473..dad8af3ebeb5 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -133,6 +133,5 @@ extern int acpi_get_psd_map(struct cpudata **); /* Methods to interact with the PCC mailbox controller. */ extern struct mbox_chan * pcc_mbox_request_channel(struct mbox_client *, unsigned int); -extern int mbox_send_message(struct mbox_chan *chan, void *mssg); #endif /* _CPPC_ACPI_H*/ -- cgit v1.2.3 From 8244062ef1e54502ef55f54cced659913f244c3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Feb 2016 16:55:26 +1030 Subject: modules: fix longstanding /proc/kallsyms vs module insertion race. For CONFIG_KALLSYMS, we keep two symbol tables and two string tables. There's one full copy, marked SHF_ALLOC and laid out at the end of the module's init section. There's also a cut-down version that only contains core symbols and strings, and lives in the module's core section. After module init (and before we free the module memory), we switch the mod->symtab, mod->num_symtab and mod->strtab to point to the core versions. We do this under the module_mutex. However, kallsyms doesn't take the module_mutex: it uses preempt_disable() and rcu tricks to walk through the modules, because it's used in the oops path. It's also used in /proc/kallsyms. There's nothing atomic about the change of these variables, so we can get the old (larger!) num_symtab and the new symtab pointer; in fact this is what I saw when trying to reproduce. By grouping these variables together, we can use a carefully-dereferenced pointer to ensure we always get one or the other (the free of the module init section is already done in an RCU callback, so that's safe). We allocate the init one at the end of the module init section, and keep the core one inside the struct module itself (it could also have been allocated at the end of the module core, but that's probably overkill). Reported-by: Weilong Chen Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541 Cc: stable@kernel.org Signed-off-by: Rusty Russell --- include/linux/module.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 4560d8f1545d..2bb0c3085706 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -324,6 +324,12 @@ struct module_layout { #define __module_layout_align #endif +struct mod_kallsyms { + Elf_Sym *symtab; + unsigned int num_symtab; + char *strtab; +}; + struct module { enum module_state state; @@ -405,15 +411,10 @@ struct module { #endif #ifdef CONFIG_KALLSYMS - /* - * We keep the symbol and string tables for kallsyms. - * The core_* fields below are temporary, loader-only (they - * could really be discarded after module init). - */ - Elf_Sym *symtab, *core_symtab; - unsigned int num_symtab, core_num_syms; - char *strtab, *core_strtab; - + /* Protected by RCU and/or module_mutex: use rcu_dereference() */ + struct mod_kallsyms *kallsyms; + struct mod_kallsyms core_kallsyms; + /* Section attributes */ struct module_sect_attrs *sect_attrs; -- cgit v1.2.3 From 06ab30034ed9c200a570ab13c017bde248ddb2a6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 31 Jan 2016 11:57:41 +0100 Subject: ALSA: rawmidi: Make snd_rawmidi_transmit() race-free A kernel WARNING in snd_rawmidi_transmit_ack() is triggered by syzkaller fuzzer: WARNING: CPU: 1 PID: 20739 at sound/core/rawmidi.c:1136 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] snd_rawmidi_transmit_ack+0x275/0x400 sound/core/rawmidi.c:1136 [] snd_virmidi_output_trigger+0x4b1/0x5a0 sound/core/seq/seq_virmidi.c:163 [< inline >] snd_rawmidi_output_trigger sound/core/rawmidi.c:150 [] snd_rawmidi_kernel_write1+0x549/0x780 sound/core/rawmidi.c:1223 [] snd_rawmidi_write+0x543/0xb30 sound/core/rawmidi.c:1273 [] __vfs_write+0x113/0x480 fs/read_write.c:528 [] vfs_write+0x167/0x4a0 fs/read_write.c:577 [< inline >] SYSC_write fs/read_write.c:624 [] SyS_write+0x111/0x220 fs/read_write.c:616 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 Also a similar warning is found but in another path: Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] rawmidi_transmit_ack+0x24a/0x3b0 sound/core/rawmidi.c:1133 [] snd_rawmidi_transmit_ack+0x51/0x80 sound/core/rawmidi.c:1163 [] snd_virmidi_output_trigger+0x2b6/0x570 sound/core/seq/seq_virmidi.c:185 [< inline >] snd_rawmidi_output_trigger sound/core/rawmidi.c:150 [] snd_rawmidi_kernel_write1+0x4bb/0x760 sound/core/rawmidi.c:1252 [] snd_rawmidi_write+0x543/0xb30 sound/core/rawmidi.c:1302 [] __vfs_write+0x113/0x480 fs/read_write.c:528 [] vfs_write+0x167/0x4a0 fs/read_write.c:577 [< inline >] SYSC_write fs/read_write.c:624 [] SyS_write+0x111/0x220 fs/read_write.c:616 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 In the former case, the reason is that virmidi has an open code calling snd_rawmidi_transmit_ack() with the value calculated outside the spinlock. We may use snd_rawmidi_transmit() in a loop just for consuming the input data, but even there, there is a race between snd_rawmidi_transmit_peek() and snd_rawmidi_tranmit_ack(). Similarly in the latter case, it calls snd_rawmidi_transmit_peek() and snd_rawmidi_tranmit_ack() separately without protection, so they are racy as well. The patch tries to address these issues by the following ways: - Introduce the unlocked versions of snd_rawmidi_transmit_peek() and snd_rawmidi_transmit_ack() to be called inside the explicit lock. - Rewrite snd_rawmidi_transmit() to be race-free (the former case). - Make the split calls (the latter case) protected in the rawmidi spin lock. BugLink: http://lkml.kernel.org/r/CACT4Y+YPq1+cYLkadwjWa5XjzF1_Vki1eHnVn-Lm0hzhSpu5PA@mail.gmail.com BugLink: http://lkml.kernel.org/r/CACT4Y+acG4iyphdOZx47Nyq_VHGbpJQK-6xNpiqUjaZYqsXOGw@mail.gmail.com Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai --- include/sound/rawmidi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index fdabbb4ddba9..f730b91e472f 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -167,6 +167,10 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream, int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count); int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream, unsigned char *buffer, int count); +int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream, + unsigned char *buffer, int count); +int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, + int count); /* main midi functions */ -- cgit v1.2.3 From a3d0a918502cc73af4f60da2cc4c5cac5573f183 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 2 Feb 2016 16:57:08 -0800 Subject: thp: make split_queue per-node Andrea Arcangeli suggested to make split queue per-node to improve scalability. Let's do it. Signed-off-by: Kirill A. Shutemov Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 33bb1b19273e..7b6c2cfee390 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -682,6 +682,12 @@ typedef struct pglist_data { */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -- cgit v1.2.3 From 65376df582174ffcec9e6471bf5b0dd79ba05e4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:29 -0800 Subject: proc: revert /proc//maps [stack:TID] annotation Commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") added [stack:TID] annotation to /proc//maps. Finding the task of a stack VMA requires walking the entire thread list, turning this into quadratic behavior: a thousand threads means a thousand stacks, so the rendering of /proc//maps needs to look at a million combinations. The cost is not in proportion to the usefulness as described in the patch. Drop the [stack:TID] annotation to make /proc//maps (and /proc//numa_maps) usable again for higher thread counts. The [stack] annotation inside /proc//task//maps is retained, as identifying the stack VMA there is an O(1) operation. Siddesh said: "The end users needed a way to identify thread stacks programmatically and there wasn't a way to do that. I'm afraid I no longer remember (or have access to the resources that would aid my memory since I changed employers) the details of their requirement. However, I did do this on my own time because I thought it was an interesting project for me and nobody really gave any feedback then as to its utility, so as far as I am concerned you could roll back the main thread maps information since the information is available in the thread-specific files" Signed-off-by: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Siddhesh Poyarekar Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1cd22f2df1a..0b50d7848e3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1341,8 +1341,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -extern struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group); +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, -- cgit v1.2.3 From c792e8240338e41eda4d06a3a71a7bb7af4e6156 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:38 -0800 Subject: mm: memcontrol: drop superfluous entry in the per-memcg stats array MEM_CGROUP_STAT_NSTATS is just a delimiter for cgroup1 statistics, not an actual array entry. Reuse it for the first cgroup2 stat entry, like in the event array. Fixes: b2807f07f4f8 ("mm: memcontrol: add "sock" to cgroup2 memory.stat") Signed-off-by: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ae48d4aeb5e..792c8981e633 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -51,7 +51,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK, + MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, MEMCG_NR_STAT, }; -- cgit v1.2.3 From 30bdbb78009e67767983085e302bec6d97afc679 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 2 Feb 2016 16:57:46 -0800 Subject: mm: polish virtual memory accounting * add VM_STACK as alias for VM_GROWSUP/DOWN depending on architecture * always account VMAs with flag VM_STACK as stack (as it was before) * cleanup classifying helpers * update comments and documentation Signed-off-by: Konstantin Khlebnikov Tested-by: Sudip Mukherjee Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++-- include/linux/mm_types.h | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b50d7848e3a..516e14944339 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,11 +201,13 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STACK VM_GROWSUP #else -#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STACK VM_GROWSDOWN #endif +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + /* * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d3ebb9d21a53..624b78b848b8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -424,9 +424,9 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long pinned_vm; /* Refcount permanently increased */ - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */ - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ - unsigned long stack_vm; /* VM_GROWSUP/DOWN */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; -- cgit v1.2.3 From 46437f9a554fbe3e110580ca08ab703b59f2f95a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 2 Feb 2016 16:57:52 -0800 Subject: radix-tree: fix race in gang lookup If the indirect_ptr bit is set on a slot, that indicates we need to redo the lookup. Introduce a new function radix_tree_iter_retry() which forces the loop to retry the lookup by setting 'slot' to NULL and turning the iterator back to point at the problematic entry. This is a pretty rare problem to hit at the moment; the lookup has to race with a grow of the radix tree from a height of 0. The consequences of hitting this race are that gang lookup could return a pointer to a radix_tree_node instead of a pointer to whatever the user had inserted in the tree. Fixes: cebbd29e1c2f ("radix-tree: rewrite gang lookup using iterator") Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Ohad Ben-Cohen Cc: Konstantin Khlebnikov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7c88ad156a29..00b17c526c1f 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -378,6 +378,22 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags); +/** + * radix_tree_iter_retry - retry this chunk of the iteration + * @iter: iterator state + * + * If we iterate over a tree protected only by the RCU lock, a race + * against deletion or creation may result in seeing a slot for which + * radix_tree_deref_retry() returns true. If so, call this function + * and continue the iteration. + */ +static inline __must_check +void **radix_tree_iter_retry(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * -- cgit v1.2.3 From febe562c20dfa8f33bee7d419c6b517986a5aa33 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 11 Jan 2016 21:31:09 -0800 Subject: target: Fix LUN_RESET active I/O handling for ACK_KREF This patch fixes a NULL pointer se_cmd->cmd_kref < 0 refcount bug during TMR LUN_RESET with active se_cmd I/O, that can be triggered during se_cmd descriptor shutdown + release via core_tmr_drain_state_list() code. To address this bug, add common __target_check_io_state() helper for ABORT_TASK + LUN_RESET w/ CMD_T_COMPLETE checking, and set CMD_T_ABORTED + obtain ->cmd_kref for both cases ahead of last target_put_sess_cmd() after TFO->aborted_task() -> transport_cmd_finish_abort() callback has completed. It also introduces SCF_ACK_KREF to determine when transport_cmd_finish_abort() needs to drop the second extra reference, ahead of calling target_put_sess_cmd() for the final kref_put(&se_cmd->cmd_kref). It also updates transport_cmd_check_stop() to avoid holding se_cmd->t_state_lock while dropping se_cmd device state via target_remove_from_state_list(), now that core_tmr_drain_state_list() is holding the se_device lock while checking se_cmd state from within TMR logic. Finally, move transport_put_cmd() release of SGL + TMR + extended CDB memory into target_free_cmd_mem() in order to avoid potential resource leaks in TMR ABORT_TASK + LUN_RESET code-paths. Also update target_release_cmd_kref() accordingly. Reviewed-by: Quinn Tran Cc: Himanshu Madhani Cc: Sagi Grimberg Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Andy Grover Cc: Mike Christie Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 5d82816cc4e3..1a76726c45a2 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -140,6 +140,7 @@ enum se_cmd_flags_table { SCF_COMPARE_AND_WRITE = 0x00080000, SCF_COMPARE_AND_WRITE_POST = 0x00100000, SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000, + SCF_ACK_KREF = 0x00400000, }; /* struct se_dev_entry->lun_flags and struct se_lun->lun_access */ -- cgit v1.2.3 From fac710e45d0b12443acd4d905c9acec27ab4ca14 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 27 Jan 2016 10:08:42 -0200 Subject: [media] vb2: fix nasty vb2_thread regression The vb2_thread implementation was made generic and was moved from videobuf2-v4l2.c to videobuf2-core.c in commit af3bac1a. Unfortunately that clearly was never tested since it broke read() causing NULL address references. The root cause was confused handling of vb2_buffer vs v4l2_buffer (the pb pointer in various core functions). The v4l2_buffer no longer exists after moving the code into the core and it is no longer needed. However, the vb2_thread code passed a pointer to a vb2_buffer to the core functions were a v4l2_buffer pointer was expected and vb2_thread expected that the vb2_buffer fields would be filled in correctly. This is obviously wrong since v4l2_buffer != vb2_buffer. Note that the pb pointer is a void pointer, so no type-checking took place. This patch fixes this problem: 1) allow pb to be NULL for vb2_core_(d)qbuf. The vb2_thread code will use a NULL pointer here since they don't care about v4l2_buffer anyway. 2) let vb2_core_dqbuf pass back the index of the received buffer. This is all vb2_thread needs: this index is the index into the q->bufs array and vb2_thread just gets the vb2_buffer from there. 3) the fileio->b pointer (that originally contained a v4l2_buffer) is removed altogether since it is no longer needed. Tested with vivid and the cobalt driver. Cc: stable@vger.kernel.org # Kernel >= 4.3 Signed-off-by: Hans Verkuil Reported-by: Matthias Schwarzott Signed-off-by: Mauro Carvalho Chehab --- include/media/videobuf2-core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index ef03ae56b1c1..8a0f55b6c2ba 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -533,7 +533,8 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, const unsigned int requested_sizes[]); int vb2_core_prepare_buf(struct vb2_queue *q, unsigned int index, void *pb); int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb); -int vb2_core_dqbuf(struct vb2_queue *q, void *pb, bool nonblocking); +int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb, + bool nonblocking); int vb2_core_streamon(struct vb2_queue *q, unsigned int type); int vb2_core_streamoff(struct vb2_queue *q, unsigned int type); -- cgit v1.2.3 From dc6ae6d8e7726bad4f1c87244b49cac851746c65 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 31 Jan 2016 14:36:07 +0100 Subject: crush: add chooseleaf_stable tunable Add a tunable to fix the bug that chooseleaf may cause unnecessary pg migrations when some device fails. Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 48b49305716b..be8f12b8f195 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -59,7 +59,8 @@ enum { CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, - CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, + CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 }; /* @@ -205,6 +206,11 @@ struct crush_map { * mappings line up a bit better with previous mappings. */ __u8 chooseleaf_vary_r; + /* if true, it makes chooseleaf firstn to return stable results (if + * no local retry) so that data migrations would be optimal when some + * device fails. */ + __u8 chooseleaf_stable; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 -- cgit v1.2.3 From 97db9a88186e3a7d3a1942370c836bf221d3ab90 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 1 Feb 2016 17:08:33 +0100 Subject: libceph: advertise support for TUNABLES5 Add TUNABLES5 feature (chooseleaf_stable tunable) to a set of features supported by default. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index f89b31d45cc8..c3b211c9fe83 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -63,6 +63,16 @@ #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ +#define CEPH_FEATURE_MON_METADATA (1ULL<<50) +#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ +#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) +#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) +#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) +#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ +#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ +#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ +#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -108,7 +118,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ - CEPH_FEATURE_CRUSH_V4) + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_CRUSH_TUNABLES5) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From b0b31a8ffe54abf0a455bcaee54dd92f08817164 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 3 Feb 2016 15:25:48 +0100 Subject: libceph: MOSDOpReply v7 encoding Empty request_redirect_t (struct ceph_request_redirect in the kernel client) is now encoded with a bool. NEW_OSDOPREPLY_ENCODING feature bit overlaps with already supported CRUSH_TUNABLES5. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index c3b211c9fe83..c1ef6f14e7be 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -73,6 +73,8 @@ #define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ #define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ +// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 +#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -119,7 +121,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_CRUSH_V4 | \ - CEPH_FEATURE_CRUSH_TUNABLES5) + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From 0fb5b1fb30fba3671dd5b1489d78e93e08d62e4e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 4 Feb 2016 00:52:12 -0500 Subject: block/sd: Return -EREMOTEIO when WRITE SAME and DISCARD are disabled When a storage device rejects a WRITE SAME command we will disable write same functionality for the device and return -EREMOTEIO to the block layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or failing the path. Yiwen Jiang discovered a small race where WRITE SAME requests issued simultaneously would cause -EIO to be returned. This happened because any requests being prepared after WRITE SAME had been disabled for the device caused us to return BLKPREP_KILL. The latter caused the block layer to return -EIO upon completion. To overcome this we introduce BLKPREP_INVALID which indicates that this is an invalid request for the device. blk_peek_request() is modified to return -EREMOTEIO in that case. Reported-by: Yiwen Jiang Suggested-by: Mike Snitzer Reviewed-by: Hannes Reinicke Reviewed-by: Ewan Milne Reviewed-by: Yiwen Jiang Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d372ea87ead5..a9b643a0647f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -681,9 +681,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) /* * q->prep_rq_fn return values */ -#define BLKPREP_OK 0 /* serve it */ -#define BLKPREP_KILL 1 /* fatal error, kill */ -#define BLKPREP_DEFER 2 /* leave on queue */ +enum { + BLKPREP_OK, /* serve it */ + BLKPREP_KILL, /* fatal error, kill, return -EIO */ + BLKPREP_DEFER, /* leave on queue */ + BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */ +}; extern unsigned long blk_max_low_pfn, blk_max_pfn; -- cgit v1.2.3 From 64566b5e767f9bc3161055ca1b443a51afb52aad Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Fri, 22 Jan 2016 17:07:25 -0500 Subject: drm: Add drm_fixp_from_fraction and drm_fixp2int_ceil drm_fixp_from_fraction allows us to create a fixed point directly from a fraction, rather than creating fixed point values and dividing later. This avoids overflow of our 64 bit value for large numbers. drm_fixp2int_ceil allows us to return the ceiling of our fixed point value. [airlied: squash Jordan's fix] 32-bit-build-fix: Jordan Lazare Signed-off-by: Harry Wentland Cc: stable@vger.kernel.org Reviewed-by: Alex Deucher Signed-off-by: Dave Airlie --- include/drm/drm_fixed.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_fixed.h b/include/drm/drm_fixed.h index d639049a613d..553210c02ee0 100644 --- a/include/drm/drm_fixed.h +++ b/include/drm/drm_fixed.h @@ -73,18 +73,28 @@ static inline u32 dfixed_div(fixed20_12 A, fixed20_12 B) #define DRM_FIXED_ONE (1ULL << DRM_FIXED_POINT) #define DRM_FIXED_DECIMAL_MASK (DRM_FIXED_ONE - 1) #define DRM_FIXED_DIGITS_MASK (~DRM_FIXED_DECIMAL_MASK) +#define DRM_FIXED_EPSILON 1LL +#define DRM_FIXED_ALMOST_ONE (DRM_FIXED_ONE - DRM_FIXED_EPSILON) static inline s64 drm_int2fixp(int a) { return ((s64)a) << DRM_FIXED_POINT; } -static inline int drm_fixp2int(int64_t a) +static inline int drm_fixp2int(s64 a) { return ((s64)a) >> DRM_FIXED_POINT; } -static inline unsigned drm_fixp_msbset(int64_t a) +static inline int drm_fixp2int_ceil(s64 a) +{ + if (a > 0) + return drm_fixp2int(a + DRM_FIXED_ALMOST_ONE); + else + return drm_fixp2int(a - DRM_FIXED_ALMOST_ONE); +} + +static inline unsigned drm_fixp_msbset(s64 a) { unsigned shift, sign = (a >> 63) & 1; @@ -136,6 +146,45 @@ static inline s64 drm_fixp_div(s64 a, s64 b) return result; } +static inline s64 drm_fixp_from_fraction(s64 a, s64 b) +{ + s64 res; + bool a_neg = a < 0; + bool b_neg = b < 0; + u64 a_abs = a_neg ? -a : a; + u64 b_abs = b_neg ? -b : b; + u64 rem; + + /* determine integer part */ + u64 res_abs = div64_u64_rem(a_abs, b_abs, &rem); + + /* determine fractional part */ + { + u32 i = DRM_FIXED_POINT; + + do { + rem <<= 1; + res_abs <<= 1; + if (rem >= b_abs) { + res_abs |= 1; + rem -= b_abs; + } + } while (--i != 0); + } + + /* round up LSB */ + { + u64 summand = (rem << 1) >= b_abs; + + res_abs += summand; + } + + res = (s64) res_abs; + if (a_neg ^ b_neg) + res = -res; + return res; +} + static inline s64 drm_fixp_exp(s64 x) { s64 tolerance = div64_s64(DRM_FIXED_ONE, 1000000); -- cgit v1.2.3 From 5e93b8208d3c419b515fb75e2601931c027e12ab Mon Sep 17 00:00:00 2001 From: Hersen Wu Date: Fri, 22 Jan 2016 17:07:28 -0500 Subject: drm/dp/mst: move GUID storage from mgr, port to only mst branch Previous implementation does not handle case below: boot up one MST branch to DP connector of ASIC. After boot up, hot plug 2nd MST branch to DP output of 1st MST, GUID is not created for 2nd MST branch. When downstream port of 2nd MST branch send upstream request, it fails because 2nd MST branch GUID is not available. New Implementation: only create GUID for MST branch and save it within Branch. Signed-off-by: Hersen Wu Reviewed-by: Harry Wentland Cc: stable@vger.kernel.org Acked-by: Alex Deucher Signed-off-by: Dave Airlie --- include/drm/drm_dp_mst_helper.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h index 24ab1787b771..fdb47051d549 100644 --- a/include/drm/drm_dp_mst_helper.h +++ b/include/drm/drm_dp_mst_helper.h @@ -44,8 +44,6 @@ struct drm_dp_vcpi { /** * struct drm_dp_mst_port - MST port * @kref: reference count for this port. - * @guid_valid: for DP 1.2 devices if we have validated the GUID. - * @guid: guid for DP 1.2 device on this port. * @port_num: port number * @input: if this port is an input port. * @mcs: message capability status - DP 1.2 spec. @@ -70,10 +68,6 @@ struct drm_dp_vcpi { struct drm_dp_mst_port { struct kref kref; - /* if dpcd 1.2 device is on this port - its GUID info */ - bool guid_valid; - u8 guid[16]; - u8 port_num; bool input; bool mcs; @@ -110,10 +104,12 @@ struct drm_dp_mst_port { * @tx_slots: transmission slots for this device. * @last_seqno: last sequence number used to talk to this. * @link_address_sent: if a link address message has been sent to this device yet. + * @guid: guid for DP 1.2 branch device. port under this branch can be + * identified by port #. * * This structure represents an MST branch device, there is one - * primary branch device at the root, along with any others connected - * to downstream ports + * primary branch device at the root, along with any other branches connected + * to downstream port of parent branches. */ struct drm_dp_mst_branch { struct kref kref; @@ -132,6 +128,9 @@ struct drm_dp_mst_branch { struct drm_dp_sideband_msg_tx *tx_slots[2]; int last_seqno; bool link_address_sent; + + /* global unique identifier to identify branch devices */ + u8 guid[16]; }; @@ -406,11 +405,9 @@ struct drm_dp_payload { * @conn_base_id: DRM connector ID this mgr is connected to. * @down_rep_recv: msg receiver state for down replies. * @up_req_recv: msg receiver state for up requests. - * @lock: protects mst state, primary, guid, dpcd. + * @lock: protects mst state, primary, dpcd. * @mst_state: if this manager is enabled for an MST capable port. * @mst_primary: pointer to the primary branch device. - * @guid_valid: GUID valid for the primary branch device. - * @guid: GUID for primary port. * @dpcd: cache of DPCD for primary port. * @pbn_div: PBN to slots divisor. * @@ -432,13 +429,11 @@ struct drm_dp_mst_topology_mgr { struct drm_dp_sideband_msg_rx up_req_recv; /* pointer to info about the initial MST device */ - struct mutex lock; /* protects mst_state + primary + guid + dpcd */ + struct mutex lock; /* protects mst_state + primary + dpcd */ bool mst_state; struct drm_dp_mst_branch *mst_primary; - /* primary MST device GUID */ - bool guid_valid; - u8 guid[16]; + u8 dpcd[DP_RECEIVER_CAP_SIZE]; u8 sink_count; int pbn_div; -- cgit v1.2.3 From 0f4a943168f31d29a1701908931acaba518b131a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Tue, 19 Jan 2016 15:23:02 -0800 Subject: target: Fix remote-port TMR ABORT + se_cmd fabric stop To address the bug where fabric driver level shutdown of se_cmd occurs at the same time when TMR CMD_T_ABORTED is happening resulting in a -1 ->cmd_kref, this patch adds a CMD_T_FABRIC_STOP bit that is used to determine when TMR + driver I_T nexus shutdown is happening concurrently. It changes target_sess_cmd_list_set_waiting() to obtain se_cmd->cmd_kref + set CMD_T_FABRIC_STOP, and drop local reference in target_wait_for_sess_cmds() and invoke extra target_put_sess_cmd() during Task Aborted Status (TAS) when necessary. Also, it adds a new target_wait_free_cmd() wrapper around transport_wait_for_tasks() for the special case within transport_generic_free_cmd() to set CMD_T_FABRIC_STOP, and is now aware of CMD_T_ABORTED + CMD_T_TAS status bits to know when an extra transport_put_cmd() during TAS is required. Note transport_generic_free_cmd() is expected to block on cmd->cmd_wait_comp in order to follow what iscsi-target expects during iscsi_conn context se_cmd shutdown. Cc: Quinn Tran Cc: Himanshu Madhani Cc: Sagi Grimberg Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Andy Grover Cc: Mike Christie Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 1a76726c45a2..1579539e5669 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -493,6 +493,8 @@ struct se_cmd { #define CMD_T_DEV_ACTIVE (1 << 7) #define CMD_T_REQUEST_STOP (1 << 8) #define CMD_T_BUSY (1 << 9) +#define CMD_T_TAS (1 << 10) +#define CMD_T_FABRIC_STOP (1 << 11) spinlock_t t_state_lock; struct kref cmd_kref; struct completion t_transport_stop_comp; -- cgit v1.2.3 From 080fe2068e1c7f19f565b30b78baf78edf16a980 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:41 -0800 Subject: mm, hugetlb: don't require CMA for runtime gigantic pages Commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") has added the runtime gigantic page allocation via alloc_contig_range(), making this support available only when CONFIG_CMA is enabled. Because it doesn't depend on MIGRATE_CMA pageblocks and the associated infrastructure, it is possible with few simple adjustments to require only CONFIG_MEMORY_ISOLATION instead of full CONFIG_CMA. After this patch, alloc_contig_range() and related functions are available and used for gigantic pages with just CONFIG_MEMORY_ISOLATION enabled. Note CONFIG_CMA selects CONFIG_MEMORY_ISOLATION. This allows supporting runtime gigantic pages without the CMA-specific checks in page allocator fastpaths. Signed-off-by: Vlastimil Babka Cc: Luiz Capitulino Cc: Kirill A. Shutemov Cc: Zhang Yanfei Cc: Yasuaki Ishimatsu Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 28ad5f6494b0..af1f2b24bbe4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_CMA - +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); +#endif +#ifdef CONFIG_CMA /* CMA stuff */ extern void init_cma_reserved_pageblock(struct page *page); - #endif #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From 12352d3cae2cebe18805a91fab34b534d7444231 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:36:50 -0800 Subject: mm: replace vma_lock_anon_vma with anon_vma_lock_read/write Sequence vma_lock_anon_vma() - vma_unlock_anon_vma() isn't safe if anon_vma appeared between lock and unlock. We have to check anon_vma first or call anon_vma_prepare() to be sure that it's here. There are only few users of these legacy helpers. Let's get rid of them. This patch fixes anon_vma lock imbalance in validate_mm(). Write lock isn't required here, read lock is enough. And reorders expand_downwards/expand_upwards: security_mmap_addr() and wrapping-around check don't have to be under anon vma lock. Link: https://lkml.kernel.org/r/CACT4Y+Y908EjM2z=706dv4rV6dWtxTLK9nFg9_7DhRMLppBo2g@mail.gmail.com Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bdf597c4f0be..a07f42bedda3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma) __put_anon_vma(anon_vma); } -static inline void vma_lock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - down_write(&anon_vma->root->rwsem); -} - -static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - up_write(&anon_vma->root->rwsem); -} - static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); -- cgit v1.2.3 From 732042821cfa106b3c20b9780e4c60fee9d68900 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:37:01 -0800 Subject: radix-tree: fix oops after radix_tree_iter_retry Helper radix_tree_iter_retry() resets next_index to the current index. In following radix_tree_next_slot current chunk size becomes zero. This isn't checked and it tries to dereference null pointer in slot. Tagged iterator is fine because retry happens only at slot 0 where tag bitmask in iter->tags is filled with single bit. Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup") Signed-off-by: Konstantin Khlebnikov Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Ohad Ben-Cohen Cc: Jeremiah Mahler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 00b17c526c1f..f54be7082207 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) * @iter: pointer to radix tree iterator * Returns: current chunk size */ -static __always_inline unsigned +static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; @@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return slot + offset + 1; } } else { - unsigned size = radix_tree_chunk_size(iter) - 1; + long size = radix_tree_chunk_size(iter); - while (size--) { + while (--size > 0) { slot++; iter->index++; if (likely(*slot)) -- cgit v1.2.3 From 57dae19065bde296dfdf08b8e46c102a671ff741 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 29 Jan 2016 00:07:25 -0800 Subject: target: Drop legacy se_cmd->task_stop_comp + REQUEST_STOP usage With CMD_T_FABRIC_STOP + se_cmd->cmd_wait_set usage in place, go ahead and drop left-over CMD_T_REQUEST_STOP checks in target_complete_cmd() and unused target_stop_cmd(). Reviewed-by: Christoph Hellwig Cc: Quinn Tran Cc: Himanshu Madhani Cc: Sagi Grimberg Cc: Hannes Reinecke Cc: Andy Grover Cc: Mike Christie Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 1579539e5669..d71a3ead9e63 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -491,7 +491,6 @@ struct se_cmd { #define CMD_T_SENT (1 << 4) #define CMD_T_STOP (1 << 5) #define CMD_T_DEV_ACTIVE (1 << 7) -#define CMD_T_REQUEST_STOP (1 << 8) #define CMD_T_BUSY (1 << 9) #define CMD_T_TAS (1 << 10) #define CMD_T_FABRIC_STOP (1 << 11) @@ -514,9 +513,6 @@ struct se_cmd { struct list_head state_list; - /* old task stop completion, consider merging with some of the above */ - struct completion task_stop_comp; - /* backend private data */ void *priv; -- cgit v1.2.3 From 1f55c718c290616889c04946864a13ef30f64929 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Thu, 14 Jan 2016 17:56:58 -0200 Subject: pty: make sure super_block is still valid in final /dev/tty close Considering current pty code and multiple devpts instances, it's possible to umount a devpts file system while a program still has /dev/tty opened pointing to a previosuly closed pty pair in that instance. In the case all ptmx and pts/N files are closed, umount can be done. If the program closes /dev/tty after umount is done, devpts_kill_index will use now an invalid super_block, which was already destroyed in the umount operation after running ->kill_sb. This is another "use after free" type of issue, but now related to the allocated super_block instance. To avoid the problem (warning at ida_remove and potential crashes) for this specific case, I added two functions in devpts which grabs additional references to the super_block, which pty code now uses so it makes sure the super block structure is still valid until pty shutdown is done. I also moved the additional inode references to the same functions, which also covered similar case with inode being freed before /dev/tty final close/shutdown. Signed-off-by: Herton R. Krzesinski Cc: stable@vger.kernel.org # 2.6.29+ Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/devpts_fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 251a2090a554..e0ee0b3000b2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,6 +19,8 @@ int devpts_new_index(struct inode *ptmx_inode); void devpts_kill_index(struct inode *ptmx_inode, int idx); +void devpts_add_ref(struct inode *ptmx_inode); +void devpts_del_ref(struct inode *ptmx_inode); /* mknod in devpts */ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, void *priv); @@ -32,6 +34,8 @@ void devpts_pty_kill(struct inode *inode); /* Dummy stubs in the no-pty case */ static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } +static inline void devpts_add_ref(struct inode *ptmx_inode) { } +static inline void devpts_del_ref(struct inode *ptmx_inode) { } static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, void *priv) { -- cgit v1.2.3 From 415e3d3e90ce9e18727e8843ae343eda5a58fad6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 3 Feb 2016 02:11:03 +0100 Subject: unix: correctly track in-flight fds in sending process user_struct The commit referenced in the Fixes tag incorrectly accounted the number of in-flight fds over a unix domain socket to the original opener of the file-descriptor. This allows another process to arbitrary deplete the original file-openers resource limit for the maximum of open files. Instead the sending processes and its struct cred should be credited. To do so, we add a reference counted struct user_struct pointer to the scm_fp_list and use it to account for the number of inflight unix fds. Fixes: 712f4aad406bb1 ("unix: properly account for FDs passed over unix sockets") Reported-by: David Herrmann Cc: David Herrmann Cc: Willy Tarreau Cc: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/af_unix.h | 4 ++-- include/net/scm.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 2a91a0561a47..9b4c418bebd8 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -6,8 +6,8 @@ #include #include -void unix_inflight(struct file *fp); -void unix_notinflight(struct file *fp); +void unix_inflight(struct user_struct *user, struct file *fp); +void unix_notinflight(struct user_struct *user, struct file *fp); void unix_gc(void); void wait_for_unix_gc(void); struct sock *unix_get_socket(struct file *filp); diff --git a/include/net/scm.h b/include/net/scm.h index 262532d111f5..59fa93c01d2a 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -21,6 +21,7 @@ struct scm_creds { struct scm_fp_list { short count; short max; + struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; -- cgit v1.2.3 From 9cf7490360bf2c46a16b7525f899e4970c5fc144 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Feb 2016 19:31:12 -0800 Subject: tcp: do not drop syn_recv on all icmp reports Petr Novopashenniy reported that ICMP redirects on SYN_RECV sockets were leading to RST. This is of course incorrect. A specific list of ICMP messages should be able to drop a SYN_RECV. For instance, a REDIRECT on SYN_RECV shall be ignored, as we do not hold a dst per SYN_RECV pseudo request. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111751 Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Reported-by: Petr Novopashenniy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f6f8f032c73e..ae6468f5c9f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -447,7 +447,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); -void tcp_req_err(struct sock *sk, u32 seq); +void tcp_req_err(struct sock *sk, u32 seq, bool abort); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, -- cgit v1.2.3 From 5f74f82ea34c0da80ea0b49192bb5ea06e063593 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 3 Feb 2016 09:26:57 +0100 Subject: net:Add sysctl_max_skb_frags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices may have limits on the number of fragments in an skb they support. Current codebase uses a constant as maximum for number of fragments one skb can hold and use. When enabling scatter/gather and running traffic with many small messages the codebase uses the maximum number of fragments and may thereby violate the max for certain devices. The patch introduces a global variable as max number of fragments. Signed-off-by: Hans Westgaard Ry Reviewed-by: Håkon Bugge Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 11f935c1a090..4ce9ff7086f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -299,6 +299,7 @@ struct sk_buff; #else #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1) #endif +extern int sysctl_max_skb_frags; typedef struct skb_frag_struct skb_frag_t; -- cgit v1.2.3 From 7e059158d57b79159eaf1f504825d19866ef2c42 Mon Sep 17 00:00:00 2001 From: David Wragg Date: Wed, 10 Feb 2016 00:05:58 +0000 Subject: vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could transmit vxlan packets of any size, constrained only by the ability to send out the resulting packets. 4.3 introduced netdevs corresponding to tunnel vports. These netdevs have an MTU, which limits the size of a packet that can be successfully encapsulated. The default MTU values are low (1500 or less), which is awkwardly small in the context of physical networks supporting jumbo frames, and leads to a conspicuous change in behaviour for userspace. Instead, set the MTU on openvswitch-created netdevs to be the relevant maximum (i.e. the maximum IP packet size minus any relevant overhead), effectively restoring the behaviour prior to 4.3. Signed-off-by: David Wragg Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6db96ea0144f..dda9abf6b89c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -230,6 +230,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4); +int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, -- cgit v1.2.3 From 73500267c930baadadb0d02284909731baf151f7 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 8 Feb 2016 14:48:11 -0500 Subject: lib/ucs2_string: Add ucs2 -> utf8 helper functions This adds ucs2_utf8size(), which tells us how big our ucs2 string is in bytes, and ucs2_as_utf8, which translates from ucs2 to utf8.. Signed-off-by: Peter Jones Tested-by: Lee, Chun-Yi Acked-by: Matthew Garrett Signed-off-by: Matt Fleming --- include/linux/ucs2_string.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ucs2_string.h b/include/linux/ucs2_string.h index cbb20afdbc01..bb679b48f408 100644 --- a/include/linux/ucs2_string.h +++ b/include/linux/ucs2_string.h @@ -11,4 +11,8 @@ unsigned long ucs2_strlen(const ucs2_char_t *s); unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength); int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len); +unsigned long ucs2_utf8size(const ucs2_char_t *src); +unsigned long ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, + unsigned long maxlength); + #endif /* _LINUX_UCS2_STRING_H_ */ -- cgit v1.2.3 From 8282f5d9c17fe15a9e658c06e3f343efae1a2a2f Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 8 Feb 2016 14:48:14 -0500 Subject: efi: Make our variable validation list include the guid All the variables in this list so far are defined to be in the global namespace in the UEFI spec, so this just further ensures we're validating the variables we think we are. Including the guid for entries will become more important in future patches when we decide whether or not to allow deletion of variables based on presence in this list. Signed-off-by: Peter Jones Tested-by: Lee, Chun-Yi Acked-by: Matthew Garrett Signed-off-by: Matt Fleming --- include/linux/efi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 569b5a866bb1..16ca611aabc8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1199,7 +1199,8 @@ int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid, struct list_head *head, bool remove); -bool efivar_validate(efi_char16_t *var_name, u8 *data, unsigned long len); +bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, + unsigned long data_size); extern struct work_struct efivar_work; void efivar_run_worker(void); -- cgit v1.2.3 From ed8b0de5a33d2a2557dce7f9429dca8cb5bc5879 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 8 Feb 2016 14:48:15 -0500 Subject: efi: Make efivarfs entries immutable by default "rm -rf" is bricking some peoples' laptops because of variables being used to store non-reinitializable firmware driver data that's required to POST the hardware. These are 100% bugs, and they need to be fixed, but in the mean time it shouldn't be easy to *accidentally* brick machines. We have to have delete working, and picking which variables do and don't work for deletion is quite intractable, so instead make everything immutable by default (except for a whitelist), and make tools that aren't quite so broad-spectrum unset the immutable flag. Signed-off-by: Peter Jones Tested-by: Lee, Chun-Yi Acked-by: Matthew Garrett Signed-off-by: Matt Fleming --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 16ca611aabc8..47be3ad7d3e5 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1201,6 +1201,8 @@ struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid, bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, unsigned long data_size); +bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, + size_t len); extern struct work_struct efivar_work; void efivar_run_worker(void); -- cgit v1.2.3 From 9095adaab8c1d82707e4e9961b6ad79b62f3361b Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Wed, 10 Feb 2016 18:59:13 -0500 Subject: target/transport: add flag to indicate CPU Affinity is observed Signed-off-by: Quinn Tran Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Fixes: fb3269b ("qla2xxx: Add selective command queuing") Signed-off-by: Himanshu Madhani Signed-off-by: Nicholas Bellinger --- include/target/target_core_base.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index d71a3ead9e63..e8c8c08bf575 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -141,6 +141,7 @@ enum se_cmd_flags_table { SCF_COMPARE_AND_WRITE_POST = 0x00100000, SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000, SCF_ACK_KREF = 0x00400000, + SCF_USE_CPUID = 0x00800000, }; /* struct se_dev_entry->lun_flags and struct se_lun->lun_access */ @@ -188,6 +189,7 @@ enum target_sc_flags_table { TARGET_SCF_BIDI_OP = 0x01, TARGET_SCF_ACK_KREF = 0x02, TARGET_SCF_UNKNOWN_SIZE = 0x04, + TARGET_SCF_USE_CPUID = 0x08, }; /* fabric independent task management function values */ -- cgit v1.2.3 From 4a389810bc3cb0e73443104f0827e81e23cb1e12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Feb 2016 16:13:14 -0800 Subject: kernel/locking/lockdep.c: convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i. e : kernel with CONFIG_UBSAN_ALIGNMENT fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. lockdep_init() and lockdep_initialized can be done away with now - a 4.6 patch has been prepared to do this. Reported-by: Mike Krinkin Suggested-by: Mike Krinkin Cc: Andrey Ryabinin Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424d914b..4dca42fd32f5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; -- cgit v1.2.3 From db78c22230d0bcc8b27b81f05b39f104f08232c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Feb 2016 16:13:17 -0800 Subject: mm: fix pfn_t vs highmem The pfn_t type uses an unsigned long to store a pfn + flags value. On a 64-bit platform the upper 12 bits of an unsigned long are never used for storing the value of a pfn. However, this is not true on highmem platforms, all 32-bits of a pfn value are used to address a 44-bit physical address space. A pfn_t needs to store a 64-bit value. Link: https://bugzilla.kernel.org/show_bug.cgi?id=112211 Fixes: 01c8f1c44b83 ("mm, dax, gpu: convert vm_insert_mixed to pfn_t") Signed-off-by: Dan Williams Reported-by: Stuart Foster Reported-by: Julian Margetson Tested-by: Julian Margetson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn.h | 2 +- include/linux/pfn_t.h | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 2d8e49711b63..1132953235c0 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -10,7 +10,7 @@ * backing is indicated by flags in the high bits of the value. */ typedef struct { - unsigned long val; + u64 val; } pfn_t; #endif diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 37448ab5fb5c..94994810c7c0 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -9,14 +9,13 @@ * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver */ -#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ - << (BITS_PER_LONG - PAGE_SHIFT)) -#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) -#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) -#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) -#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) - -static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) +#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) +#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) +#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; @@ -29,7 +28,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -87,7 +86,7 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { - const unsigned long flags = PFN_DEV|PFN_MAP; + const u64 flags = PFN_DEV|PFN_MAP; return (pfn.val & flags) == flags; } -- cgit v1.2.3