-rw-r--r--   Documentation/ABI/testing/sysfs-class-net-queues |  11
-rw-r--r--   Documentation/networking/scaling.txt             |  61
-rw-r--r--   include/linux/cpumask.h                          |  11
-rw-r--r--   include/linux/netdevice.h                        |  98
-rw-r--r--   include/net/busy_poll.h                          |   1
-rw-r--r--   include/net/sock.h                               |  52
-rw-r--r--   net/core/dev.c                                   | 288
-rw-r--r--   net/core/net-sysfs.c                             |  87
-rw-r--r--   net/core/sock.c                                  |   2
-rw-r--r--   net/ipv4/tcp_input.c                             |   3
10 files changed, 505 insertions, 109 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues
index 0c0df91b1516..978b76358661 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -42,6 +42,17 @@ Description:
 		network device transmit queue. Possible vaules depend on the
 		number of available CPU(s) in the system.
 
+What:		/sys/class/<iface>/queues/tx-<queue>/xps_rxqs
+Date:		June 2018
+KernelVersion:	4.18.0
+Contact:	netdev@vger.kernel.org
+Description:
+		Mask of the receive queue(s) currently enabled to participate
+		into the Transmit Packet Steering packet processing flow for this
+		network device transmit queue. Possible values depend on the
+		number of available receive queue(s) in the network device.
+		Default is disabled.
+
 What:		/sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/hold_time
 Date:		November 2011
 KernelVersion:	3.3
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
index f55639d71d35..b7056a8a0540 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,15 +382,40 @@ transmit queue). Secondly, cache miss rate on transmit completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue associations for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection. In this model, sending the packets on the same
+transmit queue corresponding to the associated receive queue has benefits
+in keeping the CPU overhead low. Transmit completion work is locked into
+the same queue-association that a given application is polling on. This
+avoids the overhead of triggering an interrupt on another CPU. When the
+application cleans up the packets during the busy poll, transmit completion
+may be processed along with it in the same thread context and so result in
+reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
-into the set.
+into the set. When selecting the transmit queue based on receive queue(s)
+map, the transmit device is not validated against the receive device as it
+requires expensive lookup operation in the datapath.
 
 The queue chosen for transmitting a particular flow is saved in the
 corresponding socket structure for the flow (e.g. a TCP connection).
@@ -404,11 +434,15 @@ acknowledged.
 
 XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
 default for SMP). The functionality remains disabled until explicitly
-configured. To enable XPS, the bitmap of CPUs that may use a transmit
-queue is configured using the sysfs file entry:
+configured. To enable XPS, the bitmap of CPUs/receive-queues that may
+use a transmit queue is configured using the sysfs file entry:
 
+For selection based on CPUs map:
 /sys/class/net/<dev>/queues/tx-<n>/xps_cpus
 
+For selection based on receive-queues map:
+/sys/class/net/<dev>/queues/tx-<n>/xps_rxqs
+
 == Suggested Configuration
 
 For a network device with a single transmission queue, XPS configuration
@@ -421,6 +455,11 @@ best CPUs to share a given queue are probably those that share the
 cache with the CPU that processes transmit completions for that queue
 (transmit interrupts).
 
+For transmit queue selection based on receive queue(s), XPS has to be
+explicitly configured mapping receive-queue(s) to transmit queue(s). If the
+user configuration for receive-queue map does not apply, then the transmit
+queue is selected based on the CPUs map.
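
As a rough illustration of the sysfs interface documented above, a 1:1
receive-queue to transmit-queue mapping could be set up along these lines.
This is a sketch only: the interface name eth0, the eight-queue layout and
the example CPU mask are assumptions, not part of the patch, and writing
these files requires CAP_NET_ADMIN.

  # Selection based on the CPUs map: allow CPUs 0-3 to transmit on tx-0
  echo f > /sys/class/net/eth0/queues/tx-0/xps_cpus

  # Selection based on the receive-queues map: pair rx-<n> with tx-<n> (1:1)
  for n in $(seq 0 7); do
      printf '%x\n' $((1 << n)) > /sys/class/net/eth0/queues/tx-$n/xps_rxqs
  done
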
+ Per TX Queue rate limitation: ============================= diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bf53d893ad02..57f20a0a7794 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask; #define cpu_active(cpu) ((cpu) == 0) #endif -/* verify cpu argument to cpumask_* operators */ -static inline unsigned int cpumask_check(unsigned int cpu) +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS - WARN_ON_ONCE(cpu >= nr_cpumask_bits); + WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +} + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ + cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6b377a15869..8bf8d6149f79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -731,10 +731,15 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map __rcu *cpu_map[0]; + struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */ }; -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ + +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) + +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ + (_rxqs * (_tcs) * sizeof(struct xps_map *))) + #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 @@ -1910,7 +1915,8 @@ struct net_device { int watchdog_timeo; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps; + struct xps_dev_maps __rcu *xps_cpus_map; + struct xps_dev_maps __rcu *xps_rxqs_map; #endif #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map); + +/** + * netif_attr_test_mask - Test a CPU or Rx queue set in a mask + * @j: CPU/Rx queue index + * @mask: bitmask of all cpus/rx queues + * @nr_bits: number of bits in the bitmask + * + * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. + */ +static inline bool netif_attr_test_mask(unsigned long j, + const unsigned long *mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + return test_bit(j, mask); +} + +/** + * netif_attr_test_online - Test for online CPU/Rx queue + * @j: CPU/Rx queue index + * @online_mask: bitmask for CPUs/Rx queues that are online + * @nr_bits: number of bits in the bitmask + * + * Returns true if a CPU/Rx queue is online. + */ +static inline bool netif_attr_test_online(unsigned long j, + const unsigned long *online_mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + + if (online_mask) + return test_bit(j, online_mask); + + return (j < nr_bits); +} + +/** + * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask + * @n: CPU/Rx queue index + * @srcp: the cpumask/Rx queue mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set. + */ +static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. 
*/ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (srcp) + return find_next_bit(srcp, nr_bits, n + 1); + + return n + 1; +} + +/** + * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p + * @n: CPU/Rx queue index + * @src1p: the first CPUs/Rx queues mask pointer + * @src2p: the second CPUs/Rx queues mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set in both. + */ +static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, + const unsigned long *src2p, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (src1p && src2p) + return find_next_and_bit(src1p, src2p, nr_bits, n + 1); + else if (src1p) + return find_next_bit(src1p, nr_bits, n + 1); + else if (src2p) + return find_next_bit(src2p, nr_bits, n + 1); + + return n + 1; +} #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c5187438af38..9e36fda652b7 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = skb->napi_id; #endif + sk_rx_queue_set(sk, skb); } /* variant used for unconnected sockets */ diff --git a/include/net/sock.h b/include/net/sock.h index b3b75419eafe..2ed99bfa4595 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair; * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings @@ -214,7 +215,10 @@ struct sock_common { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; - int skc_tx_queue_mapping; + unsigned short skc_tx_queue_mapping; +#ifdef CONFIG_XPS + unsigned short skc_rx_queue_mapping; +#endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; @@ -326,6 +330,9 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping +#ifdef CONFIG_XPS +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping +#endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end @@ -1681,19 +1688,58 @@ static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { + /* sk_tx_queue_mapping accept only upto a 16-bit value */ + if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) + return; sk->sk_tx_queue_mapping = tx_queue; } +#define NO_QUEUE_MAPPING USHRT_MAX + static inline void sk_tx_queue_clear(struct sock *sk) { - sk->sk_tx_queue_mapping = -1; + sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING; } static inline int sk_tx_queue_get(const struct sock *sk) { - return sk ? 
sk->sk_tx_queue_mapping : -1; + if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING) + return sk->sk_tx_queue_mapping; + + return -1; } +static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + if (skb_rx_queue_recorded(skb)) { + u16 rx_queue = skb_get_rx_queue(skb); + + if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) + return; + + sk->sk_rx_queue_mapping = rx_queue; + } +#endif +} + +static inline void sk_rx_queue_clear(struct sock *sk) +{ +#ifdef CONFIG_XPS + sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; +#endif +} + +#ifdef CONFIG_XPS +static inline int sk_rx_queue_get(const struct sock *sk) +{ + if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) + return sk->sk_rx_queue_mapping; + + return -1; +} +#endif + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/net/core/dev.c b/net/core/dev.c index dffed642e686..08d58e0debe5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS +struct static_key xps_needed __read_mostly; +EXPORT_SYMBOL(xps_needed); +struct static_key xps_rxqs_needed __read_mostly; +EXPORT_SYMBOL(xps_rxqs_needed); static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) @@ -2092,7 +2096,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2105,7 +2109,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } - RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } @@ -2135,33 +2139,68 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, + struct xps_dev_maps *dev_maps, unsigned int nr_ids, + u16 offset, u16 count, bool is_rxqs_map) +{ + bool active = false; + int i, j; + + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), + j < nr_ids;) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, + count); + if (!active) { + if (is_rxqs_map) { + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + kfree_rcu(dev_maps, rcu); + } +} + static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { + const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - int cpu, i; - bool active = false; + unsigned int nr_ids; + + if (!static_key_false(&xps_needed)) + return; mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); + if (static_key_false(&xps_rxqs_needed)) { + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, + offset, count, true); + } + } + + dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; - for_each_possible_cpu(cpu) - active |= remove_xps_queue_cpu(dev, dev_maps, cpu, - offset, count); - - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - - for (i = offset + (count - 1); count--; i--) - 
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (num_possible_cpus() > 1) + possible_mask = cpumask_bits(cpu_possible_mask); + nr_ids = nr_cpu_ids; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, + false); out_no_maps: + if (static_key_enabled(&xps_rxqs_needed)) + static_key_slow_dec(&xps_rxqs_needed); + + static_key_slow_dec(&xps_needed); mutex_unlock(&xps_map_mutex); } @@ -2170,8 +2209,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -2183,7 +2222,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -2191,9 +2230,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -2205,14 +2249,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, - u16 index) +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map) { + const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; - int i, cpu, tci, numa_node_id = -2; + int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; + unsigned int nr_ids; if (dev->num_tc) { num_tc = dev->num_tc; @@ -2221,16 +2267,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -EINVAL; } - maps_sz = XPS_DEV_MAPS_SIZE(num_tc); - if (maps_sz < L1_CACHE_BYTES) - maps_sz = L1_CACHE_BYTES; - mutex_lock(&xps_map_mutex); + if (is_rxqs_map) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) { + online_mask = cpumask_bits(cpu_online_mask); + possible_mask = cpumask_bits(cpu_possible_mask); + } + dev_maps = xmap_dereference(dev->xps_cpus_map); + nr_ids = nr_cpu_ids; + } - dev_maps = xmap_dereference(dev->xps_maps); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ - for_each_cpu_and(cpu, cpu_online_mask, mask) { + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2238,73 +2295,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - tci = cpu * num_tc + tc; - map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[tci]) : + tci = j * num_tc + tc; + map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { + static_key_slow_inc(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc(&xps_rxqs_needed); + + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. */ - tci = cpu * num_tc + tc; + tci = j * num_tc + tc; - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (!is_rxqs_map) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + if (is_rxqs_map) + rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); + else + rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); - map = xmap_dereference(dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } @@ -2317,19 +2386,23 @@ out_no_old_maps: active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - 
(numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (!is_rxqs_map) { + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + } if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - for (i = tc, tci = cpu * num_tc; i--; tci++) + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + if (!netif_attr_test_mask(j, mask, nr_ids) || + !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -2337,7 +2410,10 @@ out_no_new_maps: /* free map if not active */ if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); + if (is_rxqs_map) + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + else + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); kfree_rcu(dev_maps, rcu); } @@ -2347,11 +2423,12 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[tci]) : + xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); @@ -2363,6 +2440,12 @@ error: kfree(new_dev_maps); return -ENOMEM; } + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); +} EXPORT_SYMBOL(netif_set_xps_queue); #endif @@ -3376,32 +3459,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) } #endif /* CONFIG_NET_EGRESS */ -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + struct xps_map *map; + int queue_index = -1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; - struct xps_map *map; + struct sock *sk = skb->sk; int queue_index = -1; + if (!static_key_false(&xps_needed)) + return -1; + rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(dev->xps_rxqs_map); if (dev_maps) { - unsigned int tci = skb->sender_cpu - 1; + int tci = sk_rx_queue_get(sk); - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tci >= 0 && tci < dev->num_rx_queues) + queue_index = 
__get_xps_queue_idx(dev, skb, dev_maps, + tci); + } - map = rcu_dereference(dev_maps->cpu_map[tci]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(dev->xps_cpus_map); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); } } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..f25ac5ff48a6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; @@ -1283,6 +1283,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + unsigned long *mask, index; + int j, len, num_tc = 1, tc = 0; + + index = get_netdev_queue_index(queue); + + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), + j < dev->num_rx_queues;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + kfree(mask); + + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask, index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + kfree(mask); + return err; + } + + err = __netif_set_xps_queue(dev, mask, index, true); + kfree(mask); + return err ? 
: len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { @@ -1290,6 +1372,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..dac6d785186b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2818,6 +2818,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0U; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eecd359595fc..a4731995e899 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/static_key.h> +#include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -5592,6 +5593,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6420,6 +6422,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); |
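
To close the loop on the xps_rxqs attribute added in net-sysfs.c above, a
brief usage sketch follows. Again hedged: eth0 is an assumed interface name
and the exact mask formatting printed by the kernel may differ.

  # Read back the maps; an all-zero mask means the map is disabled,
  # which is the default for xps_rxqs per the ABI entry above.
  cat /sys/class/net/eth0/queues/tx-0/xps_cpus
  cat /sys/class/net/eth0/queues/tx-0/xps_rxqs

  # xps_rxqs_store() checks ns_capable(net->user_ns, CAP_NET_ADMIN), so an
  # unprivileged write fails with EPERM (Permission denied).
  echo 1 > /sys/class/net/eth0/queues/tx-0/xps_rxqs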