From 75dd602a5198a6e5f75534db52b6e6fbaabb33d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Mar 2016 11:36:59 +0200 Subject: lockdep: Fix lock_chain::base size lock_chain::base is used to store an index into the chain_hlocks[] array, however that array contains more elements than can be indexed using the u16. Change the lock_chain structure to use a bitfield to encode the data it needs and add BUILD_BUG_ON() assertions to check the fields are wide enough. Also, for DEBUG_LOCKDEP, assert that we don't run out of elements of that array; as that would wreck the collision detectoring. Signed-off-by: Peter Zijlstra (Intel) Cc: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sedat Dilek Cc: Theodore Ts'o Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160330093659.GS3408@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d026b190c530..d10ef06971b5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -196,9 +196,11 @@ struct lock_list { * We record lock dependency chains, so that we can cache them: */ struct lock_chain { - u8 irq_context; - u8 depth; - u16 base; + /* see BUILD_BUG_ON()s in lookup_chain_cache() */ + unsigned int irq_context : 2, + depth : 6, + base : 24; + /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; -- cgit v1.2.3 From 046339eaab26804f52f6604877f5674f70815b26 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 22 Apr 2016 00:33:03 +0300 Subject: net/mlx5e: Device's mtu field is u16 and not int For set/query MTU port firmware commands the MTU field is 16 bits, here I changed all the "int mtu" parameters of the functions wrapping those firmware commands to be u16. Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/port.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index a1d145abd4eb..b30250ab7604 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -54,9 +54,9 @@ int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status *status); -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); -void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); -void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port); +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port); +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, u8 port); int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, -- cgit v1.2.3 From cd255efff9baadd654d6160e52d17ae7c568c9d3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 22 Apr 2016 00:33:05 +0300 Subject: net/mlx5e: Use vport MTU rather than physical port MTU Set and report vport MTU rather than physical MTU, Driver will set both vport and physical port mtu and will rely on the query of vport mtu. SRIOV VFs have to report their MTU to their vport manager (PF), and this will allow them to work with any MTU they need without failing the request. Also for some cases where the PF is not a port owner, PF can work with MTU less than the physical port mtu if set physical port mtu didn't take effect. Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index bd93e6323603..301da4a5e6bf 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,8 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); +int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); -- cgit v1.2.3 From 5fc7197d3a256d9c5de3134870304b24892a4908 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Fri, 22 Apr 2016 00:33:07 +0300 Subject: net/mlx5: Add pci shutdown callback This patch introduces kexec support for mlx5. When switching kernels, kexec() calls shutdown, which unloads the driver and cleans its resources. In addition, remove unregister netdev from shutdown flow. This will allow a clean shutdown, even if some netdev clients did not release their reference from this netdev. Releasing The HW resources only is enough as the kernel is shutting down Signed-off-by: Majd Dibbiny Signed-off-by: Tariq Toukan Signed-off-by: Haggai Abramovsky Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dcd5ac8d3b14..369c837d40f5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -519,8 +519,9 @@ enum mlx5_device_state { }; enum mlx5_interface_state { - MLX5_INTERFACE_STATE_DOWN, - MLX5_INTERFACE_STATE_UP, + MLX5_INTERFACE_STATE_DOWN = BIT(0), + MLX5_INTERFACE_STATE_UP = BIT(1), + MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2), }; enum mlx5_pci_status { @@ -544,7 +545,7 @@ struct mlx5_core_dev { enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; - enum mlx5_interface_state interface_state; + unsigned long intf_state; void (*event) (struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); -- cgit v1.2.3 From 6c1ea260f89709e0021d2c59f8fd2a104b5b1123 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 Apr 2016 19:34:49 +0200 Subject: libceph: make authorizer destruction independent of ceph_auth_client Starting the kernel client with cephx disabled and then enabling cephx and restarting userspace daemons can result in a crash: [262671.478162] BUG: unable to handle kernel paging request at ffffebe000000000 [262671.531460] IP: [] kfree+0x5a/0x130 [262671.584334] PGD 0 [262671.635847] Oops: 0000 [#1] SMP [262672.055841] CPU: 22 PID: 2961272 Comm: kworker/22:2 Not tainted 4.2.0-34-generic #39~14.04.1-Ubuntu [262672.162338] Hardware name: Dell Inc. PowerEdge R720/068CDY, BIOS 2.4.3 07/09/2014 [262672.268937] Workqueue: ceph-msgr con_work [libceph] [262672.322290] task: ffff88081c2d0dc0 ti: ffff880149ae8000 task.ti: ffff880149ae8000 [262672.428330] RIP: 0010:[] [] kfree+0x5a/0x130 [262672.535880] RSP: 0018:ffff880149aeba58 EFLAGS: 00010286 [262672.589486] RAX: 000001e000000000 RBX: 0000000000000012 RCX: ffff8807e7461018 [262672.695980] RDX: 000077ff80000000 RSI: ffff88081af2be04 RDI: 0000000000000012 [262672.803668] RBP: ffff880149aeba78 R08: 0000000000000000 R09: 0000000000000000 [262672.912299] R10: ffffebe000000000 R11: ffff880819a60e78 R12: ffff8800aec8df40 [262673.021769] R13: ffffffffc035f70f R14: ffff8807e5b138e0 R15: ffff880da9785840 [262673.131722] FS: 0000000000000000(0000) GS:ffff88081fac0000(0000) knlGS:0000000000000000 [262673.245377] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [262673.303281] CR2: ffffebe000000000 CR3: 0000000001c0d000 CR4: 00000000001406e0 [262673.417556] Stack: [262673.472943] ffff880149aeba88 ffff88081af2be04 ffff8800aec8df40 ffff88081af2be04 [262673.583767] ffff880149aeba98 ffffffffc035f70f ffff880149aebac8 ffff8800aec8df00 [262673.694546] ffff880149aebac8 ffffffffc035c89e ffff8807e5b138e0 ffff8805b047f800 [262673.805230] Call Trace: [262673.859116] [] ceph_x_destroy_authorizer+0x1f/0x50 [libceph] [262673.968705] [] ceph_auth_destroy_authorizer+0x3e/0x60 [libceph] [262674.078852] [] put_osd+0x45/0x80 [libceph] [262674.134249] [] remove_osd+0xae/0x140 [libceph] [262674.189124] [] __reset_osd+0x103/0x150 [libceph] [262674.243749] [] kick_requests+0x223/0x460 [libceph] [262674.297485] [] ceph_osdc_handle_map+0x282/0x5e0 [libceph] [262674.350813] [] dispatch+0x4e/0x720 [libceph] [262674.403312] [] try_read+0x3d1/0x1090 [libceph] [262674.454712] [] ? dequeue_entity+0x152/0x690 [262674.505096] [] con_work+0xcb/0x1300 [libceph] [262674.555104] [] process_one_work+0x14e/0x3d0 [262674.604072] [] worker_thread+0x11a/0x470 [262674.652187] [] ? rescuer_thread+0x310/0x310 [262674.699022] [] kthread+0xd2/0xf0 [262674.744494] [] ? kthread_create_on_node+0x1c0/0x1c0 [262674.789543] [] ret_from_fork+0x3f/0x70 [262674.834094] [] ? kthread_create_on_node+0x1c0/0x1c0 What happens is the following: (1) new MON session is established (2) old "none" ac is destroyed (3) new "cephx" ac is constructed ... (4) old OSD session (w/ "none" authorizer) is put ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer) osd->o_auth.authorizer in the "none" case is just a bare pointer into ac, which contains a single static copy for all services. By the time we get to (4), "none" ac, freed in (2), is long gone. On top of that, a new vtable installed in (3) points us at ceph_x_destroy_authorizer(), so we end up trying to destroy a "none" authorizer with a "cephx" destructor operating on invalid memory! To fix this, decouple authorizer destruction from ac and do away with a single static "none" authorizer by making a copy for each OSD or MDS session. Authorizers themselves are independent of ac and so there is no reason for destroy_authorizer() to be an ac op. Make it an op on the authorizer itself by turning ceph_authorizer into a real struct. Fixes: http://tracker.ceph.com/issues/15447 Reported-by: Alan Zhang Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/auth.h | 10 +++++----- include/linux/ceph/osd_client.h | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 260d78b587c4..1563265d2097 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -12,9 +12,12 @@ */ struct ceph_auth_client; -struct ceph_authorizer; struct ceph_msg; +struct ceph_authorizer { + void (*destroy)(struct ceph_authorizer *); +}; + struct ceph_auth_handshake { struct ceph_authorizer *authorizer; void *authorizer_buf; @@ -62,8 +65,6 @@ struct ceph_auth_client_ops { struct ceph_auth_handshake *auth); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len); - void (*destroy_authorizer)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -112,8 +113,7 @@ extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); -extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *a); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4343df806710..cbf460927c42 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -16,7 +16,6 @@ struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; -struct ceph_authorizer; /* * completion callback for async writepages -- cgit v1.2.3 From 841645b5f2dfceac69b78fcd0c9050868d41ea61 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 25 Apr 2016 15:33:55 -0400 Subject: ipv6: Revert optional address flusing on ifdown. This reverts the following three commits: 70af921db6f8835f4b11c65731116560adb00c14 799977d9aafbf0ca0b9c39b04cbfb16db71302c9 f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac The feature was ill conceived, has terrible semantics, and has added nothing but regressions to the already fragile ipv6 stack. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7edc14fb66b6..4b2267e1b7c3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -62,7 +62,6 @@ struct ipv6_devconf { struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; - __s32 keep_addr_on_down; void *sysctl; }; -- cgit v1.2.3 From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:06:48 -0400 Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset kicks off asynchronous NUMA node migration if necessary during task migration and flushes it from cpuset_post_attach_flush() which is called at the end of __cgroup_procs_write(). This is to avoid performing migration with cgroup_threadgroup_rwsem write-locked which can lead to deadlock through dependency on kworker creation. memcg has a similar issue with charge moving, so let's convert it to an official callback rather than the current one-off cpuset specific function. This patch adds cgroup_subsys->post_attach callback and makes cpuset register cpuset_post_attach_flush() as its ->post_attach. The conversion is mostly one-to-one except that the new callback is called under cgroup_mutex. This is to guarantee that no other migration operations are started before ->post_attach callbacks are finished. cgroup_mutex is one of the outermost mutex in the system and has never been and shouldn't be a problem. We can add specialized synchronization around __cgroup_procs_write() but I don't think there's any noticeable benefit. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: # 4.4+ prerequisite for the next patch --- include/linux/cgroup-defs.h | 1 + include/linux/cpuset.h | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3e39ae5bc799..5b17de62c962 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -444,6 +444,7 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); + void (*post_attach)(void); int (*can_fork)(struct task_struct *task); void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index fea160ee5803..85a868ccb493 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern void cpuset_post_attach_flush(void); - #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline void cpuset_post_attach_flush(void) -{ -} - #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ -- cgit v1.2.3 From 6a923934c33c750a595868af6bef5f1a1fa90054 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 Apr 2016 11:47:41 -0400 Subject: Revert "ipv6: Revert optional address flusing on ifdown." This reverts commit 841645b5f2dfceac69b78fcd0c9050868d41ea61. Ok, this puts the feature back. I've decided to apply David A.'s bug fix and run with that rather than make everyone wait another whole release for this feature. Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4b2267e1b7c3..7edc14fb66b6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -62,6 +62,7 @@ struct ipv6_devconf { struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; + __s32 keep_addr_on_down; void *sysctl; }; -- cgit v1.2.3 From 8ead9dd54716d1e05e129959f702fcc1786f82b4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Apr 2016 20:04:08 -0700 Subject: devpts: more pty driver interface cleanups This is more prep-work for the upcoming pty changes. Still just code cleanup with no actual semantic changes. This removes a bunch pointless complexity by just having the slave pty side remember the dentry associated with the devpts slave rather than the inode. That allows us to remove all the "look up the dentry" code for when we want to remove it again. Together with moving the tty pointer from "inode->i_private" to "dentry->d_fsdata" and getting rid of pointless inode locking, this removes about 30 lines of code. Not only is the end result smaller, it's simpler and easier to understand. The old code, for example, depended on the d_find_alias() to not just find the dentry, but also to check that it is still hashed, which in turn validated the tty pointer in the inode. That is a _very_ roundabout way to say "invalidate the cached tty pointer when the dentry is removed". The new code just does dentry->d_fsdata = NULL; in devpts_pty_kill() instead, invalidating the tty pointer rather more directly and obviously. Don't do something complex and subtle when the obvious straightforward approach will do. The rest of the patch (ie apart from code deletion and the above tty pointer clearing) is just switching the calling convention to pass the dentry or file pointer around instead of the inode. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds --- include/linux/devpts_fs.h | 6 +++--- include/linux/tty_driver.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 358a4db72a27..5871f292b596 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -27,11 +27,11 @@ int devpts_new_index(struct pts_fs_info *); void devpts_kill_index(struct pts_fs_info *, int); /* mknod in devpts */ -struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); +struct dentry *devpts_pty_new(struct pts_fs_info *, int, void *); /* get private structure */ -void *devpts_get_priv(struct inode *pts_inode); +void *devpts_get_priv(struct dentry *); /* unlink */ -void devpts_pty_kill(struct inode *inode); +void devpts_pty_kill(struct dentry *); #endif diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 161052477f77..b742b5e47cc2 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -7,7 +7,7 @@ * defined; unless noted otherwise, they are optional, and can be * filled in with a null pointer. * - * struct tty_struct * (*lookup)(struct tty_driver *self, int idx) + * struct tty_struct * (*lookup)(struct tty_driver *self, struct file *, int idx) * * Return the tty device corresponding to idx, NULL if there is not * one currently in use and an ERR_PTR value on error. Called under @@ -250,7 +250,7 @@ struct serial_icounter_struct; struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, - struct inode *inode, int idx); + struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); -- cgit v1.2.3 From 0224a4a30b57385a60065aa598181868881d8fc6 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 27 Apr 2016 14:04:20 +0300 Subject: device property: Avoid potential dereferences of invalid pointers Since fwnode may hold ERR_PTR(-ENODEV) or it may be NULL, the fwnode type checks is_of_node(), is_acpi_node() and is is_pset_node() need to consider it. Using IS_ERR_OR_NULL() to check it. Fixes: 0d67e0fa1664 (device property: fix for a case of use-after-free) Reported-by: Dan Carpenter Signed-off-by: Heikki Krogerus [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 7fcb681baadf..31758036787c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -133,7 +133,7 @@ void of_core_init(void); static inline bool is_of_node(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_OF; + return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_OF; } static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) -- cgit v1.2.3 From 986ef95ecdd3eb6fa29433e68faa94c7624083be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 31 Mar 2016 19:03:25 +0300 Subject: IB/mlx5: Expose correct max_sge_rd limit mlx5 devices (Connect-IB, ConnectX-4, ConnectX-4-LX) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Cc: linux-stable@vger.kernel.org Reported-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8156e3c9239c..b3575f392492 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -392,6 +392,17 @@ enum { MLX5_CAP_OFF_CMDIF_CSUM = 46, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX5_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; -- cgit v1.2.3 From 7b7483409f09c15f30ac43242ead1ab3061e1f59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 25 Apr 2016 15:13:17 -0300 Subject: net: fix net_gso_ok for new GSO types. Fix casting in net_gso_ok. Otherwise the shift on gso_type << NETIF_F_GSO_SHIFT may hit the 32th bit and make it look like a INT_MIN, which is then promoted from signed to uint64 which is 0xffffffff80000000, resulting in wrong behavior when it is and'ed with the feature itself, as in: This test app: #include #include int main(int argc, char **argv) { uint64_t feature1; uint64_t feature2; int gso_type = 1 << 15; feature1 = gso_type << 16; feature2 = (uint64_t)gso_type << 16; printf("%lx %lx\n", feature1, feature2); return 0; } Gives: ffffffff80000000 80000000 So that this: return (features & feature) == feature; Actually works on more bits than expected and invalid ones. Fix is to promote it earlier. Issue noted while rebasing SCTP GSO patch but posting separetely as someone else may experience this meanwhile. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8395308a2445..b3c46b019ac1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4004,7 +4004,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); -- cgit v1.2.3 From 92117d8443bc5afacc8d5ba82e541946310f106e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Apr 2016 18:56:20 -0700 Subject: bpf: fix refcnt overflow On a system with >32Gbyte of phyiscal memory and infinite RLIMIT_MEMLOCK, the malicious application may overflow 32-bit bpf program refcnt. It's also possible to overflow map refcnt on 1Tb system. Impose 32k hard limit which means that the same bpf program or map cannot be shared by more than 32k processes. Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 21ee41b92e8a..f1d5c5acc8dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -171,12 +171,13 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_put_rcu(struct bpf_prog *prog); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -void bpf_map_inc(struct bpf_map *map, bool uref); +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); -- cgit v1.2.3 From 66ee95d16a7f1b7b4f1dd74a2d81c6e19dc29a14 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 28 Apr 2016 16:18:24 -0700 Subject: mm: exclude HugeTLB pages from THP page_mapped() logic HugeTLB pages cannot be split, so we use the compound_mapcount to track rmaps. Currently page_mapped() will check the compound_mapcount, but will also go through the constituent pages of a THP compound page and query the individual _mapcount's too. Unfortunately, page_mapped() does not distinguish between HugeTLB and THP compound pages and assumes that a compound page always needs to have HPAGE_PMD_NR pages querying. For most cases when dealing with HugeTLB this is just inefficient, but for scenarios where the HugeTLB page size is less than the pmd block size (e.g. when using contiguous bit on ARM) this can lead to crashes. This patch adjusts the page_mapped function such that we skip the unnecessary THP reference checks for HugeTLB pages. Fixes: e1534ae95004 ("mm: differentiate page_mapped() from page_mapcount() for compound pages") Signed-off-by: Steve Capper Acked-by: Kirill A. Shutemov Cc: Will Deacon Cc: Catalin Marinas Cc: Michal Hocko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a55e5be0894f..79b6c18d0a38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1031,6 +1031,8 @@ static inline bool page_mapped(struct page *page) page = compound_head(page); if (atomic_read(compound_mapcount_ptr(page)) >= 0) return true; + if (PageHuge(page)) + return false; for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; -- cgit v1.2.3 From aa88b68c3b1dce8bc3fd54c8a7372a777ff265cd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Apr 2016 16:18:27 -0700 Subject: thp: keep huge zero page pinned until tlb flush Andrea has found[1] a race condition on MMU-gather based TLB flush vs split_huge_page() or shrinker which frees huge zero under us (patch 1/2 and 2/2 respectively). With new THP refcounting, we don't need patch 1/2: mmu_gather keeps the page pinned until flush is complete and the pin prevents the page from being split under us. We still need patch 2/2. This is simplified version of Andrea's patch. We don't need fancy encoding. [1] http://lkml.kernel.org/r/1447938052-22165-1-git-send-email-aarcange@redhat.com Signed-off-by: Kirill A. Shutemov Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623e24b1..d7b9e5346fba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -152,6 +152,7 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) } struct page *get_huge_zero_page(void); +void put_huge_zero_page(void); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) @@ -208,6 +209,10 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline void put_huge_zero_page(void) +{ + BUILD_BUG(); +} static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags) -- cgit v1.2.3 From 28093f9f34cedeaea0f481c58446d9dac6dd620f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 28 Apr 2016 16:18:35 -0700 Subject: numa: fix /proc//numa_maps for THP In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the layouts may differ depending on the architecture. On s390 this will lead to inaccurate numa_maps accounting in /proc because of misguided pte_present() and pte_dirty() checks on the fake pte. On other architectures pte_present() and pte_dirty() may work by chance, but there may be an issue with direct-access (dax) mappings w/o underlying struct pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page() the fake pte will be checked with pte_special() and because there is no "special" bit in a pmd, this will always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an invalid struct page pointer would then be returned that can crash the kernel. This patch fixes the numa_maps THP handling by introducing new "_pmd" variants of the can_gather_numa_stats() and vm_normal_page() functions. Signed-off-by: Gerald Schaefer Cc: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Cc: Dan Williams Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michael Holzheu Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 79b6c18d0a38..864d7221de84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1140,6 +1140,8 @@ struct zap_details { struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -- cgit v1.2.3 From 2c94b53738549d81dc7464a32117d1f5112c64d3 Mon Sep 17 00:00:00 2001 From: Tim Bingham Date: Fri, 29 Apr 2016 13:30:23 -0400 Subject: net: Implement net_dbg_ratelimited() for CONFIG_DYNAMIC_DEBUG case Prior to commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") the implementation of net_dbg_ratelimited() was buggy for both the DEBUG and CONFIG_DYNAMIC_DEBUG cases. The bug was that net_ratelimit() was being called and, despite returning true, nothing was being printed to the console. This resulted in messages like the following - "net_ratelimit: %d callbacks suppressed" with no other output nearby. After commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") the bug is fixed for the DEBUG case. However, there's no output at all for CONFIG_DYNAMIC_DEBUG case. This patch restores debug output (if enabled) for the CONFIG_DYNAMIC_DEBUG case. Add a definition of net_dbg_ratelimited() for the CONFIG_DYNAMIC_DEBUG case. The implementation takes care to check that dynamic debugging is enabled before calling net_ratelimit(). Fixes: d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") Signed-off-by: Tim Bingham Signed-off-by: David S. Miller --- include/linux/net.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 49175e4ced11..f840d77c6c31 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -246,7 +246,15 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) +#define net_dbg_ratelimited(fmt, ...) \ +do { \ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ + if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \ + net_ratelimit()) \ + __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__); \ +} while (0) +#elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else -- cgit v1.2.3 From 689de1d6ca95b3b5bd8ee446863bf81a4883ea25 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 May 2016 12:46:42 -0700 Subject: Minimal fix-up of bad hashing behavior of hash_64() This is a fairly minimal fixup to the horribly bad behavior of hash_64() with certain input patterns. In particular, because the multiplicative value used for the 64-bit hash was intentionally bit-sparse (so that the multiply could be done with shifts and adds on architectures without hardware multipliers), some bits did not get spread out very much. In particular, certain fairly common bit ranges in the input (roughly bits 12-20: commonly with the most information in them when you hash things like byte offsets in files or memory that have block factors that mean that the low bits are often zero) would not necessarily show up much in the result. There's a bigger patch-series brewing to fix up things more completely, but this is the fairly minimal fix for the 64-bit hashing problem. It simply picks a much better constant multiplier, spreading the bits out a lot better. NOTE! For 32-bit architectures, the bad old hash_64() remains the same for now, since 64-bit multiplies are expensive. The bigger hashing cleanup will replace the 32-bit case with something better. The new constants were picked by George Spelvin who wrote that bigger cleanup series. I just picked out the constants and part of the comment from that series. Cc: stable@vger.kernel.org Cc: George Spelvin Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/hash.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hash.h b/include/linux/hash.h index 1afde47e1528..79c52fa81cac 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -32,12 +32,28 @@ #error Wordsize not 32 or 64 #endif +/* + * The above primes are actively bad for hashing, since they are + * too sparse. The 32-bit one is mostly ok, the 64-bit one causes + * real problems. Besides, the "prime" part is pointless for the + * multiplicative hash. + * + * Although a random odd number will do, it turns out that the golden + * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice + * properties. + * + * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. + * (See Knuth vol 3, section 6.4, exercise 9.) + */ +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + static __always_inline u64 hash_64(u64 val, unsigned int bits) { u64 hash = val; -#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 - hash = hash * GOLDEN_RATIO_PRIME_64; +#if BITS_PER_LONG == 64 + hash = hash * GOLDEN_RATIO_64; #else /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ u64 n = hash; -- cgit v1.2.3 From af67eb9e7e1ab37880459f83153d34b3c42b0075 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 2 May 2016 09:25:16 -0700 Subject: vxlan: Add checksum check to the features check function We need to perform an additional check on the inner headers to determine if we can offload the checksum for them. Previously this check didn't occur so we would generate an invalid frame in the case of an IPv6 header encapsulated inside of an IPv4 tunnel. To fix this I added a secondary check to vxlan_features_check so that we can verify that we can offload the inner checksum. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_ether.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index d5569734f672..548fd535fd02 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -28,6 +28,11 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) return (struct ethhdr *)skb_mac_header(skb); } +static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb_inner_mac_header(skb); +} + int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); -- cgit v1.2.3 From 4550c4e157ca3da929593bb6c64080a59141af35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 May 2016 16:22:03 -0700 Subject: mm: memcontrol: let v2 cgroups follow changes in system swappiness Cgroup2 currently doesn't have a per-cgroup swappiness setting. We might want to add one later - that's a different discussion - but until we do, the cgroups should always follow the system setting. Otherwise it will be unchangeably set to whatever the ancestor inherited from the system setting at the time of cgroup creation. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: [4.5] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2b83359c19ca..0a4cd4703f40 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -533,6 +533,10 @@ static inline swp_entry_t get_swap_page(void) #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { + /* Cgroup2 doesn't have per-cgroup swappiness */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return vm_swappiness; + /* root ? */ if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; -- cgit v1.2.3 From 4e1016dac1ccce6d8a960775526cdc3a5baa690b Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Thu, 5 May 2016 16:22:06 -0700 Subject: rapidio/mport_cdev: fix uapi type definitions Fix problems in uapi definitions reported by Gabriel Laskar: (see https://lkml.org/lkml/2016/4/5/205 for details) - move public header file rio_mport_cdev.h to include/uapi/linux directory - change types in data structures passed as IOCTL parameters - improve parameter checking in some IOCTL service routines Signed-off-by: Alexandre Bounine Reported-by: Gabriel Laskar Tested-by: Barry Wood Cc: Gabriel Laskar Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rio_mport_cdev.h | 271 ----------------------------------------- 1 file changed, 271 deletions(-) delete mode 100644 include/linux/rio_mport_cdev.h (limited to 'include/linux') diff --git a/include/linux/rio_mport_cdev.h b/include/linux/rio_mport_cdev.h deleted file mode 100644 index b65d19df76d2..000000000000 --- a/include/linux/rio_mport_cdev.h +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2015-2016, Integrated Device Technology Inc. - * Copyright (c) 2015, Prodrive Technologies - * Copyright (c) 2015, Texas Instruments Incorporated - * Copyright (c) 2015, RapidIO Trade Association - * All rights reserved. - * - * This software is available to you under a choice of one of two licenses. - * You may choose to be licensed under the terms of the GNU General Public - * License(GPL) Version 2, or the BSD-3 Clause license below: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _RIO_MPORT_CDEV_H_ -#define _RIO_MPORT_CDEV_H_ - -#ifndef __user -#define __user -#endif - -struct rio_mport_maint_io { - uint32_t rioid; /* destID of remote device */ - uint32_t hopcount; /* hopcount to remote device */ - uint32_t offset; /* offset in register space */ - size_t length; /* length in bytes */ - void __user *buffer; /* data buffer */ -}; - -/* - * Definitions for RapidIO data transfers: - * - memory mapped (MAPPED) - * - packet generation from memory (TRANSFER) - */ -#define RIO_TRANSFER_MODE_MAPPED (1 << 0) -#define RIO_TRANSFER_MODE_TRANSFER (1 << 1) -#define RIO_CAP_DBL_SEND (1 << 2) -#define RIO_CAP_DBL_RECV (1 << 3) -#define RIO_CAP_PW_SEND (1 << 4) -#define RIO_CAP_PW_RECV (1 << 5) -#define RIO_CAP_MAP_OUTB (1 << 6) -#define RIO_CAP_MAP_INB (1 << 7) - -struct rio_mport_properties { - uint16_t hdid; - uint8_t id; /* Physical port ID */ - uint8_t index; - uint32_t flags; - uint32_t sys_size; /* Default addressing size */ - uint8_t port_ok; - uint8_t link_speed; - uint8_t link_width; - uint32_t dma_max_sge; - uint32_t dma_max_size; - uint32_t dma_align; - uint32_t transfer_mode; /* Default transfer mode */ - uint32_t cap_sys_size; /* Capable system sizes */ - uint32_t cap_addr_size; /* Capable addressing sizes */ - uint32_t cap_transfer_mode; /* Capable transfer modes */ - uint32_t cap_mport; /* Mport capabilities */ -}; - -/* - * Definitions for RapidIO events; - * - incoming port-writes - * - incoming doorbells - */ -#define RIO_DOORBELL (1 << 0) -#define RIO_PORTWRITE (1 << 1) - -struct rio_doorbell { - uint32_t rioid; - uint16_t payload; -}; - -struct rio_doorbell_filter { - uint32_t rioid; /* 0xffffffff to match all ids */ - uint16_t low; - uint16_t high; -}; - - -struct rio_portwrite { - uint32_t payload[16]; -}; - -struct rio_pw_filter { - uint32_t mask; - uint32_t low; - uint32_t high; -}; - -/* RapidIO base address for inbound requests set to value defined below - * indicates that no specific RIO-to-local address translation is requested - * and driver should use direct (one-to-one) address mapping. -*/ -#define RIO_MAP_ANY_ADDR (uint64_t)(~((uint64_t) 0)) - -struct rio_mmap { - uint32_t rioid; - uint64_t rio_addr; - uint64_t length; - uint64_t handle; - void *address; -}; - -struct rio_dma_mem { - uint64_t length; /* length of DMA memory */ - uint64_t dma_handle; /* handle associated with this memory */ - void *buffer; /* pointer to this memory */ -}; - - -struct rio_event { - unsigned int header; /* event type RIO_DOORBELL or RIO_PORTWRITE */ - union { - struct rio_doorbell doorbell; /* header for RIO_DOORBELL */ - struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */ - } u; -}; - -enum rio_transfer_sync { - RIO_TRANSFER_SYNC, /* synchronous transfer */ - RIO_TRANSFER_ASYNC, /* asynchronous transfer */ - RIO_TRANSFER_FAF, /* fire-and-forget transfer */ -}; - -enum rio_transfer_dir { - RIO_TRANSFER_DIR_READ, /* Read operation */ - RIO_TRANSFER_DIR_WRITE, /* Write operation */ -}; - -/* - * RapidIO data exchange transactions are lists of individual transfers. Each - * transfer exchanges data between two RapidIO devices by remote direct memory - * access and has its own completion code. - * - * The RapidIO specification defines four types of data exchange requests: - * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows - * to specify the required type of write operation or combination of them when - * only the last data packet requires response. - * - * NREAD: read up to 256 bytes from remote device memory into local memory - * NWRITE: write up to 256 bytes from local memory to remote device memory - * without confirmation - * SWRITE: as NWRITE, but all addresses and payloads must be 64-bit aligned - * NWRITE_R: as NWRITE, but expect acknowledgment from remote device. - * - * The default exchange is chosen from NREAD and any of the WRITE modes as the - * driver sees fit. For write requests the user can explicitly choose between - * any of the write modes for each transaction. - */ -enum rio_exchange { - RIO_EXCHANGE_DEFAULT, /* Default method */ - RIO_EXCHANGE_NWRITE, /* All packets using NWRITE */ - RIO_EXCHANGE_SWRITE, /* All packets using SWRITE */ - RIO_EXCHANGE_NWRITE_R, /* Last packet NWRITE_R, others NWRITE */ - RIO_EXCHANGE_SWRITE_R, /* Last packet NWRITE_R, others SWRITE */ - RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */ -}; - -struct rio_transfer_io { - uint32_t rioid; /* Target destID */ - uint64_t rio_addr; /* Address in target's RIO mem space */ - enum rio_exchange method; /* Data exchange method */ - void __user *loc_addr; - uint64_t handle; - uint64_t offset; /* Offset in buffer */ - uint64_t length; /* Length in bytes */ - uint32_t completion_code; /* Completion code for this transfer */ -}; - -struct rio_transaction { - uint32_t transfer_mode; /* Data transfer mode */ - enum rio_transfer_sync sync; /* Synchronization method */ - enum rio_transfer_dir dir; /* Transfer direction */ - size_t count; /* Number of transfers */ - struct rio_transfer_io __user *block; /* Array of transfers */ -}; - -struct rio_async_tx_wait { - uint32_t token; /* DMA transaction ID token */ - uint32_t timeout; /* Wait timeout in msec, if 0 use default TO */ -}; - -#define RIO_MAX_DEVNAME_SZ 20 - -struct rio_rdev_info { - uint32_t destid; - uint8_t hopcount; - uint32_t comptag; - char name[RIO_MAX_DEVNAME_SZ + 1]; -}; - -/* Driver IOCTL codes */ -#define RIO_MPORT_DRV_MAGIC 'm' - -#define RIO_MPORT_MAINT_HDID_SET \ - _IOW(RIO_MPORT_DRV_MAGIC, 1, uint16_t) -#define RIO_MPORT_MAINT_COMPTAG_SET \ - _IOW(RIO_MPORT_DRV_MAGIC, 2, uint32_t) -#define RIO_MPORT_MAINT_PORT_IDX_GET \ - _IOR(RIO_MPORT_DRV_MAGIC, 3, uint32_t) -#define RIO_MPORT_GET_PROPERTIES \ - _IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties) -#define RIO_MPORT_MAINT_READ_LOCAL \ - _IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_WRITE_LOCAL \ - _IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_READ_REMOTE \ - _IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_WRITE_REMOTE \ - _IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io) -#define RIO_ENABLE_DOORBELL_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter) -#define RIO_DISABLE_DOORBELL_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter) -#define RIO_ENABLE_PORTWRITE_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter) -#define RIO_DISABLE_PORTWRITE_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter) -#define RIO_SET_EVENT_MASK \ - _IOW(RIO_MPORT_DRV_MAGIC, 13, unsigned int) -#define RIO_GET_EVENT_MASK \ - _IOR(RIO_MPORT_DRV_MAGIC, 14, unsigned int) -#define RIO_MAP_OUTBOUND \ - _IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap) -#define RIO_UNMAP_OUTBOUND \ - _IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap) -#define RIO_MAP_INBOUND \ - _IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap) -#define RIO_UNMAP_INBOUND \ - _IOW(RIO_MPORT_DRV_MAGIC, 18, uint64_t) -#define RIO_ALLOC_DMA \ - _IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem) -#define RIO_FREE_DMA \ - _IOW(RIO_MPORT_DRV_MAGIC, 20, uint64_t) -#define RIO_TRANSFER \ - _IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction) -#define RIO_WAIT_FOR_ASYNC \ - _IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait) -#define RIO_DEV_ADD \ - _IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info) -#define RIO_DEV_DEL \ - _IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info) - -#endif /* _RIO_MPORT_CDEV_H_ */ -- cgit v1.2.3 From 127393fbe597dd85863a9bdccaa11007e7d4948f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 May 2016 16:22:20 -0700 Subject: mm: thp: kvm: fix memory corruption in KVM with THP enabled After the THP refcounting change, obtaining a compound pages from get_user_pages() no longer allows us to assume the entire compound page is immediately mappable from a secondary MMU. A secondary MMU doesn't want to call get_user_pages() more than once for each compound page, in order to know if it can map the whole compound page. So a secondary MMU needs to know from a single get_user_pages() invocation when it can map immediately the entire compound page to avoid a flood of unnecessary secondary MMU faults and spurious atomic_inc()/atomic_dec() (pages don't have to be pinned by MMU notifier users). Ideally instead of the page->_mapcount < 1 check, get_user_pages() should return the granularity of the "page" mapping in the "mm" passed to get_user_pages(). However it's non trivial change to pass the "pmd" status belonging to the "mm" walked by get_user_pages up the stack (up to the caller of get_user_pages). So the fix just checks if there is not a single pte mapping on the page returned by get_user_pages, and in turn if the caller can assume that the whole compound page is mapped in the current "mm" (in a pmd_trans_huge()). In such case the entire compound page is safe to map into the secondary MMU without additional get_user_pages() calls on the surrounding tail/head pages. In addition of being faster, not having to run other get_user_pages() calls also reduces the memory footprint of the secondary MMU fault in case the pmd split happened as result of memory pressure. Without this fix after a MADV_DONTNEED (like invoked by QEMU during postcopy live migration or balloning) or after generic swapping (with a failure in split_huge_page() that would only result in pmd splitting and not a physical page split), KVM would map the whole compound page into the shadow pagetables, despite regular faults or userfaults (like UFFDIO_COPY) may map regular pages into the primary MMU as result of the pte faults, leading to the guest mode and userland mode going out of sync and not working on the same memory at all times. Any other secondary MMU notifier manager (KVM is just one of the many MMU notifier users) will need the same information if it doesn't want to run a flood of get_user_pages_fast and it can support multiple granularity in the secondary MMU mappings, so I think it is justified to be exposed not just to KVM. The other option would be to move transparent_hugepage_adjust to mm/huge_memory.c but that currently has all kind of KVM data structures in it, so it's definitely not a cut-and-paste work, so I couldn't do a fix as cleaner as this one for 4.6. Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: "Kirill A. Shutemov" Cc: "Li, Liang Z" Cc: Amit Shah Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f4ed4f1b0c77..6b052aa7b5b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -516,6 +516,27 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } +/* + * PageTransCompoundMap is the same as PageTransCompound, but it also + * guarantees the primary MMU has the entire compound page mapped + * through pmd_trans_huge, which in turn guarantees the secondary MMUs + * can also map the entire compound page. This allows the secondary + * MMUs to call get_user_pages() only once for each compound page and + * to immediately map the entire compound page with a single secondary + * MMU fault. If there will be a pmd split later, the secondary MMUs + * will get an update through the MMU notifier invalidation through + * split_huge_pmd(). + * + * Unlike PageTransCompound, this is safe to be called only while + * split_huge_pmd() cannot run from under us, like if protected by the + * MMU notifier, otherwise it may result in page->_mapcount < 0 false + * positives. + */ +static inline int PageTransCompoundMap(struct page *page) +{ + return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; +} + /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) TESTPAGEFLAG_FALSE(DoubleMap) TESTSETFLAG_FALSE(DoubleMap) -- cgit v1.2.3