From 7b3801927e52f8621de311277f7fc727635019e7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:48:58 +0100 Subject: xfrm: introduce xfrm_trans_queue_net This will be used by TCP encapsulation to write packets to the encap socket without holding the user socket's lock. Without this reinjection, we're already holding the lock of the user socket, and then try to lock the encap socket as well when we enqueue the encrypted packet. While at it, add a BUILD_BUG_ON like we usually do for skb->cb, since it's missing for struct xfrm_trans_cb. Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dda3c025452e..56ff86621bb4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1547,6 +1547,9 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); -- cgit v1.2.3 From e27cca96cd68fa2c6814c90f9a1cfd36bb68c593 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:49:02 +0100 Subject: xfrm: add espintcp (RFC 8229) TCP encapsulation of IKE and IPsec messages (RFC 8229) is implemented as a TCP ULP, overriding in particular the sendmsg and recvmsg operations. A Stream Parser is used to extract messages out of the TCP stream using the first 2 bytes as length marker. Received IKE messages are put on "ike_queue", waiting to be dequeued by the custom recvmsg implementation. 
Received ESP messages are sent to XFRM, like with UDP encapsulation. Some of this code is taken from the original submission by Herbert Xu. Currently, only IPv4 is supported, like for UDP encapsulation. Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/espintcp.h | 39 +++++++++++++++++++++++++++++++++++++++ include/net/xfrm.h | 1 + 2 files changed, 40 insertions(+) create mode 100644 include/net/espintcp.h (limited to 'include/net') diff --git a/include/net/espintcp.h b/include/net/espintcp.h new file mode 100644 index 000000000000..dd7026a00066 --- /dev/null +++ b/include/net/espintcp.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_ESPINTCP_H +#define _NET_ESPINTCP_H + +#include +#include + +void __init espintcp_init(void); + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb); +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb); +bool tcp_is_ulp_esp(struct sock *sk); + +struct espintcp_msg { + struct sk_buff *skb; + struct sk_msg skmsg; + int offset; + int len; +}; + +struct espintcp_ctx { + struct strparser strp; + struct sk_buff_head ike_queue; + struct sk_buff_head out_queue; + struct espintcp_msg partial; + void (*saved_data_ready)(struct sock *sk); + void (*saved_write_space)(struct sock *sk); + struct work_struct work; + bool tx_running; +}; + +static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* RCU is only needed for diag */ + return (__force void *)icsk->icsk_ulp_data; +} +#endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 56ff86621bb4..8f71c111e65a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -193,6 +193,7 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; + struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; -- cgit v1.2.3 From 
65e6d90168f3593df0ae598502bcbf20d78ff0fb Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Mon, 9 Dec 2019 14:19:59 -0500 Subject: net-tcp: Disable TCP ssthresh metrics cache by default This patch introduces a sysctl knob "net.ipv4.tcp_no_ssthresh_metrics_save" that disables TCP ssthresh metrics cache by default. Other parts of TCP metrics cache, e.g. rtt, cwnd, remain unchanged. As modern networks become more and more dynamic, the TCP metrics cache today often causes more harm than benefit. For example, the same IP address is often shared by different subscribers behind NAT in residential networks. Even if the IP address is not shared by different users, caching the slow-start threshold of a previous short flow using loss-based congestion control (e.g. cubic) often causes the future longer flows of the same network path to exit slow-start prematurely with abysmal throughput. Caching ssthresh is very risky and can lead to terrible performance. Therefore it makes sense to disable ssthresh caching by default and let administrators opt in for specific networks. This practice also has worked well for several years of deployment with CUBIC congestion control at Google. Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: Kevin(Yudong) Yang Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c0c0791b1912..08b98414d94e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -154,6 +154,7 @@ struct netns_ipv4 { int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; + int sysctl_tcp_no_ssthresh_metrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; -- cgit v1.2.3 From 0e12190578d081d4fe54d41635ec6e5a6eb0d01a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:04 +0100 Subject: vsock: add local transport support in the vsock core This patch allows to register a transport able to handle local communication (loopback). Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 4206dc6d813f..b1c717286993 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -98,6 +98,8 @@ struct vsock_transport_send_notify_data { #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 +/* Transport provides local (loopback) communication */ +#define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; -- cgit v1.2.3 From 3c32da19a858fb1ae8a76bf899160be49f338506 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 9 Dec 2019 13:03:46 +0300 Subject: unix: Show number of pending scm files of receive queue in fdinfo Unix sockets like a block box. You never know what is stored there: there may be a file descriptor holding a mount or a block device, or there may be whole universes with namespaces, sockets with receive queues full of sockets etc. 
The patch adds a little debug information and accounts the number of files (not recursive) that are in the receive queue of a unix socket. Sometimes this is useful to determine which socket should be investigated or which task should be killed to put the reference counter on a resource. v2: Pass correct argument to lockdep Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/af_unix.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3426d6dacc45..17e10fba2152 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -41,6 +41,10 @@ struct unix_skb_parms { u32 consumed; } __randomize_layout; +struct scm_stat { + u32 nr_fds; +}; + #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) @@ -65,6 +69,7 @@ struct unix_sock { #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; + struct scm_stat scm_stat; }; static inline struct unix_sock *unix_sk(const struct sock *sk) -- cgit v1.2.3 From 32d5109a9d864aea3981f0b5ea736eee4e11b42a Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:19 +0100 Subject: netlink: rename nl80211_validate_nested() to nla_validate_nested() Function nl80211_validate_nested() is not specific to nl80211, it's a counterpart to nla_validate_nested_deprecated() with strict validation. For consistency with other validation and parse functions, rename it to nla_validate_nested(). Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Reviewed-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/net/netlink.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index b140c8f1be22..56c365dc6dc7 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1735,7 +1735,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) } /** - * nla_validate_nested - Validate a stream of nested attributes + * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy @@ -1758,9 +1758,9 @@ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, } static inline int -nl80211_validate_nested(const struct nlattr *start, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +nla_validate_nested(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); -- cgit v1.2.3 From 1f1c1d7c89ee538f3e36b43098e95973f8fa37db Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 13 Dec 2019 21:24:27 +0100 Subject: ipv6: Annotate bitwise IPv6 dsfield pointer cast The sparse commit 6002ded74587 ("add a flag to warn on casts to/from bitwise pointers") introduced a check for non-direct casts from/to restricted datatypes (when -Wbitwise-pointer is enabled). This triggered a warning in ipv6_get_dsfield() because sparse doesn't know that the buffer already points to some data in the correct bitwise integer format. This was already fixed in ipv6_change_dsfield() by the __force attribute and can be fixed here the same way. Signed-off-by: Sven Eckelmann Signed-off-by: David S. 
Miller --- include/net/dsfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsfield.h b/include/net/dsfield.h index 1a245ee10c95..a59a57ffc546 100644 --- a/include/net/dsfield.h +++ b/include/net/dsfield.h @@ -21,7 +21,7 @@ static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { - return ntohs(*(const __be16 *)ipv6h) >> 4; + return ntohs(*(__force const __be16 *)ipv6h) >> 4; } -- cgit v1.2.3 From 54e1f08bddbe63a3c0ae44f65df2c8b895003ef4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 13 Dec 2019 21:24:28 +0100 Subject: ipv6: Annotate ipv6_addr_is_* bitwise pointer casts The sparse commit 6002ded74587 ("add a flag to warn on casts to/from bitwise pointers") introduced a check for non-direct casts from/to restricted datatypes (when -Wbitwise-pointer is enabled). This triggered a warning in the 64 bit optimized ipv6_addr_is_*() functions because sparse doesn't know that the buffer already points to some data in the correct bitwise integer format. But these were correct and can therefore be marked with __force to signalize sparse an intended cast to a specific bitwise type. Signed-off-by: Sven Eckelmann Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 1bab88184d3c..a088349dd94f 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -437,7 +437,7 @@ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 - __be64 *p = (__be64 *)addr; + __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | @@ -449,7 +449,7 @@ static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 - __be64 *p = (__be64 *)addr; + __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | @@ -466,7 +466,7 @@ static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 - __be64 *p = (__be64 *)addr; + __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; @@ -481,7 +481,7 @@ static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 - __be64 *p = (__be64 *)addr; + __be64 *p = (__force __be64 *)addr; return ((p[0] ^ 
cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; -- cgit v1.2.3 From 9586a992fb752b14ac18fc86fe086b0e3372fff4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2019 14:55:08 +0000 Subject: net: pkt_cls: Clarify a comment The bit about negating HW backlog left me scratching my head. Clarify the comment. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e553fc80eb23..a7c5d492bc04 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -791,9 +791,8 @@ enum tc_prio_command { struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; - /* In case that a prio qdisc is offloaded and now is changed to a - * non-offloadedable config, it needs to update the backlog & qlen - * values to negate the HW backlog & qlen values (and only them). + /* At the point of un-offloading the Qdisc, the reported backlog and + * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; -- cgit v1.2.3 From d35eb52bd2ac7557b62bda52668f2e64dde2cf90 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2019 14:55:15 +0000 Subject: net: sch_ets: Make the ETS qdisc offloadable Add hooks at appropriate points to make it possible to offload the ETS Qdisc. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a7c5d492bc04..47b115e2012a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -823,4 +823,35 @@ struct tc_root_qopt_offload { bool ingress; }; +enum tc_ets_command { + TC_ETS_REPLACE, + TC_ETS_DESTROY, + TC_ETS_STATS, + TC_ETS_GRAFT, +}; + +struct tc_ets_qopt_offload_replace_params { + unsigned int bands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ + unsigned int weights[TCQ_ETS_MAX_BANDS]; + struct gnet_stats_queue *qstats; +}; + +struct tc_ets_qopt_offload_graft_params { + u8 band; + u32 child_handle; +}; + +struct tc_ets_qopt_offload { + enum tc_ets_command command; + u32 handle; + u32 parent; + union { + struct tc_ets_qopt_offload_replace_params replace_params; + struct tc_qopt_offload_stats stats; + struct tc_ets_qopt_offload_graft_params graft_params; + }; +}; + #endif -- cgit v1.2.3 From 8d5a49e9e31ba1ddd34a54b2351d068a90c78707 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2019 14:12:01 -0800 Subject: net/tls: add helper for testing if socket is RX offloaded There is currently no way for driver to reliably check that the socket it has looked up is in fact RX offloaded. Add a helper. This allows drivers to catch misbehaving firmware. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/net/tls.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index df630f5fc723..bf9eb4823933 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -641,6 +641,7 @@ int tls_sw_fallback_init(struct sock *sk, #ifdef CONFIG_TLS_DEVICE void tls_device_init(void); void tls_device_cleanup(void); +void tls_device_sk_destruct(struct sock *sk); int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); @@ -649,6 +650,14 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, struct sk_buff *skb, struct strp_msg *rxm); + +static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) +{ + if (!sk_fullsock(sk) || + smp_load_acquire(&sk->sk_destruct) != tls_device_sk_destruct) + return false; + return tls_get_ctx(sk)->rx_conf == TLS_HW; +} #else static inline void tls_device_init(void) {} static inline void tls_device_cleanup(void) {} -- cgit v1.2.3 From e312b9e706ed6d94f6cc9088fcd9fbd81de4525c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:10:02 +0100 Subject: xsk: Make xskmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all xskmaps, which simplifies __xsk_map_flush() and xsk_map_alloc(). 
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e3780e4b74e1..48594740d67c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -72,7 +72,6 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; - struct list_head __percpu *flush_list; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; @@ -139,9 +138,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs); -void __xsk_map_flush(struct bpf_map *map); +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); +void __xsk_map_flush(void); static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) @@ -369,13 +367,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } -static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; } -static inline void __xsk_map_flush(struct bpf_map *map) +static inline void __xsk_map_flush(void) { } -- cgit v1.2.3 From 03896ef1f0cb23d2742ddf486c531c700a2da7d6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:27 +0100 Subject: xsk: Change names of validation functions Change the names of the validation functions to better reflect what they are doing. The uppermost ones are reading entries from the rings and only the bottom ones validate entries. So xskq_cons_read_ is a better prefix name. 
Also change the xskq_cons_read_ functions to return a bool as the descriptor or address is already returned by reference in the parameters. Everyone is using the return value as a bool anyway. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-9-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 48594740d67c..63f005830866 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -118,7 +118,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); @@ -197,7 +197,7 @@ static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) return xsk_umem_has_addrs(umem, cnt - rq->length); } -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; -- cgit v1.2.3 From f8509aa078de0842ec1817e8026e58620cd05d3b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:28 +0100 Subject: xsk: ixgbe: i40e: ice: mlx5: Xsk_umem_discard_addr to xsk_umem_release_addr Change the name of xsk_umem_discard_addr to xsk_umem_release_addr to better reflect the new naming of the AF_XDP queue manipulation functions. 
As this function is used by drivers implementing support for AF_XDP zero-copy, it requires a name change in these drivers. The function xsk_umem_release_addr_rq has also changed name in the same fashion. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-10-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 63f005830866..e86ec48ef627 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -119,7 +119,7 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); @@ -208,12 +208,12 @@ static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return addr; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; if (!rq->length) - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); else rq->length--; } @@ -258,7 +258,7 @@ static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr(struct xdp_umem *umem) +static inline void xsk_umem_release_addr(struct xdp_umem *umem) { } @@ -332,7 +332,7 @@ static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void 
xsk_umem_release_addr_rq(struct xdp_umem *umem) { } -- cgit v1.2.3 From 48fda74f0a9377ce2145ac5f06b6db0274880826 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Dec 2019 09:02:14 +0100 Subject: net: dsa: add support for Atheros AR9331 TAG format Add support for tag format used in Atheros AR9331 built-in switch. Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6767dc3f66c0..da5578db228e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -43,6 +43,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 +#define DSA_TAG_PROTO_AR9331_VALUE 16 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -61,6 +62,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, + DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, }; struct packet_type; -- cgit v1.2.3 From 6b722237b656d045c0b9bab9966a5e46604258ba Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:12 +0200 Subject: net: fib_notifier: Add temporary events to the FIB notification chain Subsequent patches are going to simplify the IPv6 route offload API, which will only use three events - replace, delete and append. Introduce a temporary version of replace and delete in order to make the conversion easier to review. Note that append does not need a temporary version, as it is currently not used. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/fib_notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..b3c54325caec 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -23,6 +23,8 @@ enum fib_event_type { FIB_EVENT_NH_DEL, FIB_EVENT_VIF_ADD, FIB_EVENT_VIF_DEL, + FIB_EVENT_ENTRY_REPLACE_TMP, + FIB_EVENT_ENTRY_DEL_TMP, }; struct fib_notifier_ops { -- cgit v1.2.3 From d2f0c9b11410f9c6a07c126f8a215b0b81cdcf6c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:17 +0200 Subject: ipv6: Handle route deletion notification For the purpose of route offload, when a single route is deleted, it is only of interest if it is the first route in the node or if it is sibling to such a route. In the first case, distinguish between several possibilities: 1. Route is the last route in the node. Emit a delete notification 2. Route is followed by a non-multipath route. Emit a replace notification for the non-multipath route. 3. Route is followed by a multipath route. Emit a replace notification for the multipath route. In the second case, only emit a delete notification to ensure the route is no longer used as a valid nexthop. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f1535f172935..b579faea41e9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -487,6 +487,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, -- cgit v1.2.3 From caafb2509fac1432849650826953dd88b7cbe374 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:20 +0200 Subject: ipv6: Remove old route notifications and convert listeners Now that mlxsw is converted to use the new FIB notifications it is possible to delete the old ones and use the new replace / append / delete notifications. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index b3c54325caec..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -23,8 +23,6 @@ enum fib_event_type { FIB_EVENT_NH_DEL, FIB_EVENT_VIF_ADD, FIB_EVENT_VIF_DEL, - FIB_EVENT_ENTRY_REPLACE_TMP, - FIB_EVENT_ENTRY_DEL_TMP, }; struct fib_notifier_ops { -- cgit v1.2.3 From dea53bb80e07b9e1641b865493908c20cb8df2ac Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:28 -0800 Subject: tcp: Add l3index to tcp_md5sig_key and md5 functions Add l3index to tcp_md5sig_key to represent the L3 domain of a key, and add l3index to tcp_md5_do_add and tcp_md5_do_del to fill in the key. 
With the key now based on an l3index, add the new parameter to the lookup functions and consider the l3index when looking for a match. The l3index comes from the skb when processing ingress packets leveraging the helpers created for socket lookups, tcp_v4_sdif and inet_iif (and the v6 variants). When the sdif index is set it means the packet ingressed a device that is part of an L3 domain and inet_iif points to the VRF device. For egress, the L3 domain is determined from the socket binding and sk_bound_dev_if. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e460ea7f767b..7df37e2fddca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1532,8 +1532,9 @@ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ - union tcp_md5_addr addr; u8 prefixlen; + union tcp_md5_addr addr; + int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; @@ -1577,34 +1578,33 @@ struct tcp_md5sig_pool { int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp); + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen); + int family, u8 prefixlen, int l3index); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG #include extern struct static_key_false tcp_md5_needed; -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 
const union tcp_md5_addr *addr, int family); static inline struct tcp_md5sig_key * -tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +tcp_md5_do_lookup(const struct sock *sk, int l3index, + const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed)) return NULL; - return __tcp_md5_do_lookup(sk, addr, family); + return __tcp_md5_do_lookup(sk, l3index, addr, family); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else -static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup(const struct sock *sk, int l3index, + const union tcp_md5_addr *addr, int family) { return NULL; } -- cgit v1.2.3 From 36278a5d4d354e5d5610aa728831db9e03cc3d8d Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Wed, 11 Dec 2019 01:54:43 +0000 Subject: Bluetooth: Adding a bt_dev_warn_ratelimited macro. The macro will be used to display rate limited warning messages in the log. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index fabee6db0abb..bd2675266859 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -129,6 +129,8 @@ void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); __printf(1, 2) +void bt_warn_ratelimited(const char *fmt, ...); +__printf(1, 2) void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) @@ -147,6 +149,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) +#define bt_dev_warn_ratelimited(hdev, fmt, ...) 
\ + bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) -- cgit v1.2.3 From 657cc646475b721f5c5bab82e7fd43302c7c8358 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 11 Dec 2019 11:34:36 +0100 Subject: Bluetooth: Remove usage of BT_ERR_RATELIMITED macro The macro is really not needed and can be replaced with either usage of bt_err_ratelimited or bt_dev_err_ratelimited. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bd2675266859..e42bb8e03c09 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -138,8 +138,6 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) -#define BT_ERR_RATELIMITED(fmt, ...) bt_err_ratelimited(fmt "\n", ##__VA_ARGS__) - #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ @@ -152,7 +150,7 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) /* Connection and socket states */ enum { -- cgit v1.2.3 From 1efd927d660e6ab02a9cd32fbbe3c7dc47980132 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 2 Jan 2020 15:00:55 -0800 Subject: Bluetooth: Add support for LE PHY Update Complete event This handles LE PHY Update Complete event and store both tx_phy and rx_phy into hci_conn. 
Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ include/net/bluetooth/hci_core.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5bc1e30dedde..07b6ecedc6ce 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2186,6 +2186,14 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +#define HCI_EV_LE_PHY_UPDATE_COMPLETE 0x0c +struct hci_ev_le_phy_update_complete { + __u8 status; + __u16 handle; + __u8 tx_phy; + __u8 rx_phy; +} __packed; + #define HCI_EV_LE_EXT_ADV_REPORT 0x0d struct hci_ev_le_ext_adv_report { __le16 evt_type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b689aceb636b..faebe3859931 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -493,6 +493,8 @@ struct hci_conn { __u16 le_supv_timeout; __u8 le_adv_data[HCI_MAX_AD_LENGTH]; __u8 le_adv_data_len; + __u8 le_tx_phy; + __u8 le_rx_phy; __s8 rssi; __s8 tx_power; __s8 max_tx_power; -- cgit v1.2.3 From a68578c20a9667463ee3000402b21644ea62d753 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 4 Jan 2020 02:37:10 +0200 Subject: net: dsa: Make deferred_xmit private to sja1105 There are 3 things that are wrong with the DSA deferred xmit mechanism: 1. Its introduction has made the DSA hotpath ever so slightly more inefficient for everybody, since DSA_SKB_CB(skb)->deferred_xmit needs to be initialized to false for every transmitted frame, in order to figure out whether the driver requested deferral or not (a very rare occasion, rare even for the only driver that does use this mechanism: sja1105). That was necessary to avoid kfree_skb from freeing the skb. 2. Because L2 PTP is a link-local protocol like STP, it requires management routes and deferred xmit with this switch. 
But as opposed to STP, the deferred work mechanism needs to schedule the packet rather quickly for the TX timestamp to be collected in time and sent to user space. But there is no provision for controlling the scheduling priority of this deferred xmit workqueue. Too bad this is a rather specific requirement for a feature that nobody else uses (more below). 3. Perhaps most importantly, it makes the DSA core adhere a bit too much to the NXP company-wide policy "Innovate Where It Doesn't Matter". The sja1105 is probably the only DSA switch that requires some frames sent from the CPU to be routed to the slave port via an out-of-band configuration (register write) rather than in-band (DSA tag). And there are indeed very good reasons to not want to do that: if that out-of-band register is at the other end of a slow bus such as SPI, then you limit that Ethernet flow's throughput to effectively the throughput of the SPI bus. So hardware vendors should definitely not be encouraged to design this way. We do _not_ want more widespread use of this mechanism. Luckily we have a solution for each of the 3 issues: For 1, we can just remove that variable in the skb->cb and counteract the effect of kfree_skb with skb_get, much to the same effect. The advantage, of course, being that anybody who doesn't use deferred xmit doesn't need to do any extra operation in the hotpath. For 2, we can create a kernel thread for each port's deferred xmit work. If the user switch ports are named swp0, swp1, swp2, the kernel threads will be named swp0_xmit, swp1_xmit, swp2_xmit (there appears to be a 15 character length limit on kernel thread names). With this, the user can change the scheduling priority with chrt $(pidof swp2_xmit). For 3, we can actually move the entire implementation to the sja1105 driver. So this patch deletes the generic implementation from the DSA core and adds a new one, more adequate to the requirements of PTP TX timestamping, in sja1105_main.c.
Suggested-by: Florian Fainelli Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index da5578db228e..23b1c58656d4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -90,7 +90,6 @@ struct dsa_device_ops { struct dsa_skb_cb { struct sk_buff *clone; - bool deferred_xmit; }; struct __dsa_skb_cb { @@ -192,9 +191,6 @@ struct dsa_port { struct phylink *pl; struct phylink_config pl_config; - struct work_struct xmit_work; - struct sk_buff_head xmit_queue; - struct list_head list; /* @@ -564,11 +560,6 @@ struct dsa_switch_ops { bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); - /* - * Deferred frame Tx - */ - netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, - struct sk_buff *skb); /* Devlink parameters */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); -- cgit v1.2.3 From 787cac3f5a650fd3184a41c5a27a2fe9ded833aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Jan 2020 03:34:12 +0200 Subject: net: dsa: Pass pcs_poll flag from driver to PHYLINK The DSA drivers that implement .phylink_mac_link_state should normally register an interrupt for the PCS, from which they should call phylink_mac_change(). However not all switches implement this, and those who don't should set this flag in dsa_switch in the .setup callback, so that PHYLINK will poll for a few ms until the in-band AN link timer expires and the PCS state settles. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 23b1c58656d4..0c39fed8cd99 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -279,6 +279,11 @@ struct dsa_switch { */ bool vlan_filtering; + /* MAC PCS does not provide link state change interrupt, and requires + * polling. Flag passed on to PHYLINK. + */ + bool pcs_poll; + size_t num_ports; }; -- cgit v1.2.3 From 6181e5cb752e5de9f56fbcee3f0206a2c51f1478 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 2 Jan 2020 21:18:09 +0530 Subject: devlink: add support for reporter recovery completion It is possible that a reporter recovery completion do not finish successfully when recovery is triggered via devlink_health_reporter_recover as recovery could be processed in different context. In such scenario an error is returned by driver when recover hook is invoked and successful recovery completion is intimated later. Expose devlink recover done API to update recovery stats. Signed-off-by: Vikas Gupta Signed-off-by: David S. 
Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 47f87b2fcf63..453f45cc1519 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1000,6 +1000,8 @@ int devlink_health_report(struct devlink_health_reporter *reporter, void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); -- cgit v1.2.3 From 4d776482ecc689bdd68627985ac4cb5a6f325953 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jan 2020 21:06:05 -0800 Subject: net: dsa: Get information about stacked DSA protocol It is possible to stack multiple DSA switches in a way that they are not part of the tree (disjoint) but the DSA master of a switch is a DSA slave of another. When that happens switch drivers may have to know this is the case so as to determine whether their tagging protocol has a remote chance of working. This is useful for specific switch drivers such as b53 where devices have been known to be stacked in the wild without the Broadcom tag protocol supporting that feature. This allows b53 to continue supporting those devices by forcing the disabling of Broadcom tags on the outermost switches if necessary. The get_tag_protocol() function is therefore updated to gain an additional enum dsa_tag_protocol argument which denotes the current tagging protocol used by the DSA master we are attached to, else DSA_TAG_PROTO_NONE for the top of the dsa_switch_tree. Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller --- include/net/dsa.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0c39fed8cd99..63495e3443ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -380,7 +380,8 @@ typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, - int port); + int port, + enum dsa_tag_protocol mprot); int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); -- cgit v1.2.3 From 0baf26b0fcd74bbfcef53c5d5e8bad2b99c8d0d2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:08 -0800 Subject: bpf: tcp: Support tcp_congestion_ops in bpf This patch makes "struct tcp_congestion_ops" to be the first user of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops in bpf. The BPF implemented tcp_congestion_ops can be used like regular kernel tcp-cc through sysctl and setsockopt. e.g. [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic net.ipv4.tcp_congestion_control = bpf_cubic There has been attempt to move the TCP CC to the user space (e.g. CCP in TCP). The common arguments are faster turn around, get away from long-tail kernel versions in production...etc, which are legit points. BPF has been the continuous effort to join both kernel and userspace upsides together (e.g. XDP to gain the performance advantage without bypassing the kernel). The recent BPF advancements (in particular BTF-aware verifier, BPF trampoline, BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) possible in BPF. 
It allows a faster turnaround for testing algorithm in the production while leveraging the existing (and continue growing) BPF feature/framework instead of building one specifically for userspace TCP CC. This patch allows write access to a few fields in tcp-sock (in bpf_tcp_ca_btf_struct_access()). The optional "get_info" is unsupported now. It can be added later. One possible way is to output the info with a btf-id to describe the content. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..9dd975be7fdf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1101,6 +1102,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET -- cgit v1.2.3 From e9cdced78dc20c1592c1fb98ed064943007a46c5 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:14 -0800 Subject: net: Make sock protocol value checks more specific SK_PROTOCOL_MAX is only used in two places, for DECNet and AX.25. The limits have more to do with the those protocol definitions than they do with the data type of sk_protocol, so remove SK_PROTOCOL_MAX and use U8_MAX directly. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..091e55428415 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -458,7 +458,6 @@ struct sock { sk_userlocks : 4, sk_protocol : 8, sk_type : 16; -#define SK_PROTOCOL_MAX U8_MAX u16 sk_gso_max_segs; u8 sk_pacing_shift; unsigned long sk_lingertime; -- cgit v1.2.3 From bf9765145b856fa2e238a5b8a54453795ba30ad6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:15 -0800 Subject: sock: Make sk_protocol a 16-bit value Match the 16-bit width of skbuff->protocol. Fills an 8-bit hole so sizeof(struct sock) does not change. Also take care of BPF field access for sk_type/sk_protocol. Both of them are now outside the bitfield, so we can use load instructions without further shifting/masking. v5 -> v6: - update eBPF accessors, too (Intel's kbuild test robot) v2 -> v3: - keep 'sk_type' 2 bytes aligned (Eric) v1 -> v2: - preserve sk_pacing_shift as bit field (Eric) Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 091e55428415..8766f9bc3e70 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,30 +436,15 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. 
*/ - unsigned int __sk_flags_offset[0]; -#ifdef __BIG_ENDIAN_BITFIELD -#define SK_FL_PROTO_SHIFT 16 -#define SK_FL_PROTO_MASK 0x00ff0000 - -#define SK_FL_TYPE_SHIFT 0 -#define SK_FL_TYPE_MASK 0x0000ffff -#else -#define SK_FL_PROTO_SHIFT 8 -#define SK_FL_PROTO_MASK 0x0000ff00 - -#define SK_FL_TYPE_SHIFT 16 -#define SK_FL_TYPE_MASK 0xffff0000 -#endif - - unsigned int sk_padding : 1, + u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, - sk_userlocks : 4, - sk_protocol : 8, - sk_type : 16; - u16 sk_gso_max_segs; + sk_userlocks : 4; u8 sk_pacing_shift; + u16 sk_type; + u16 sk_protocol; + u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; -- cgit v1.2.3 From c74a39c861aeaf6b789b7abdbb3256f54c9fb365 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:17 -0800 Subject: tcp: Add MPTCP option number TCP option 30 is allocated for MPTCP by the IANA. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..85f1d7ff6e8b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP -- cgit v1.2.3 From 1323059301c8f36d933876233516245d882346a6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:18 -0800 Subject: tcp, ulp: Add clone operation to tcp_ulp_ops If ULP is used on a listening socket, icsk_ulp_ops and icsk_ulp_data are copied when the listener is cloned. Sometimes the clone is immediately deleted, which will invoke the release op on the clone and likely corrupt the listening socket's icsk_ulp_data. The clone operation is invoked immediately after the clone is copied and gives the ULP type an opportunity to set up the clone socket and its icsk_ulp_data. The MPTCP ULP clone will silently fallback to plain TCP on allocation failure, so 'clone()' does not need to return an error code. v6 -> v7: - move and rename ulp clone helper to make it inline-friendly v5 -> v6: - clarified MPTCP clone usage in commit message Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 85f1d7ff6e8b..ac52633e7061 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2154,6 +2154,9 @@ struct tcp_ulp_ops { /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); + /* clone ulp */ + void (*clone)(const struct request_sock *req, struct sock *newsk, + const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; -- cgit v1.2.3 From 3ee17bc78e0f3fdeff9890993e8f3a9f5145163b Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:19 -0800 Subject: mptcp: Add MPTCP to skb extensions Add enum value for MPTCP and update config dependencies v5 -> v6: - fixed '__unused' field size Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 include/net/mptcp.h (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 000000000000..326043c29c0a --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. 
+ */ + +#ifndef __NET_MPTCP_H +#define __NET_MPTCP_H + +#include + +/* MPTCP sk_buff extension data */ +struct mptcp_ext { + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + __unused:3; + /* one byte hole */ +}; + +#endif /* __NET_MPTCP_H */ -- cgit v1.2.3 From 85712484110df308215077be6ee21c4e57d7dec2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:20 -0800 Subject: tcp: coalesce/collapse must respect MPTCP extensions Coalesce and collapse of packets carrying MPTCP extensions is allowed when the newer packet has no extension or the extensions carried by both packets are equal. This allows merging of TSO packet trains and even cross-TSO packets, and does not require any additional action when moving data into existing SKBs. v3 -> v4: - allow collapsing, under mptcp_skb_can_collapse() constraint v5 -> v6: - clarify MPTCP skb extensions must always be cleared at allocation time Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 8 ++++++++ 2 files changed, 65 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 326043c29c0a..0573ae75c3db 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -8,6 +8,7 @@ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H +#include #include /* MPTCP sk_buff extension data */ @@ -25,4 +26,60 @@ struct mptcp_ext { /* one byte hole */ }; +#ifdef CONFIG_MPTCP + +/* move the skb extension ownership, with the assumption that 'to' is + * newly allocated + */ +static inline void mptcp_skb_ext_move(struct sk_buff *to, + struct sk_buff *from) +{ + if (!skb_ext_exist(from, SKB_EXT_MPTCP)) + return; + + if (WARN_ON_ONCE(to->active_extensions)) + skb_ext_put(to); + + to->active_extensions = from->active_extensions; + to->extensions = from->extensions; + from->active_extensions = 0; +} + +static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, + const struct mptcp_ext *from_ext) +{ + /* MPTCP always clears the ext when adding it to the skb, so + * holes do not bother us here + */ + return !from_ext || + (to_ext && from_ext && + !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); +} + +/* check if skbs can be collapsed. + * MPTCP collapse is allowed if neither @to or @from carry an mptcp data + * mapping, or if the extension of @to is the same as @from. + * Collapsing is not possible if @to lacks an extension, but @from carries one.
+ */ +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), + skb_ext_find(from, SKB_EXT_MPTCP)); +} + +#else + +static inline void mptcp_skb_ext_move(struct sk_buff *to, + const struct sk_buff *from) +{ +} + +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return true; +} + +#endif /* CONFIG_MPTCP */ #endif /* __NET_MPTCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ac52633e7061..13bc83fab454 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -978,6 +979,13 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) return likely(!TCP_SKB_CB(skb)->eor); } +static inline bool tcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(tcp_skb_can_collapse_to(to) && + mptcp_skb_can_collapse(to, from)); +} + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ -- cgit v1.2.3 From 35b2c32116091ef87a15c604cac363da8322a288 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:21 -0800 Subject: tcp: Export TCP functions and ops struct MPTCP will make use of tcp_send_mss() and tcp_push() when sending data to specific TCP subflows. tcp_request_sock_ipvX_ops and ipvX_specific will be referenced during TCP subflow creation. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/tcp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 13bc83fab454..5e4133d09b9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -330,6 +330,9 @@ int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_send_mss(struct sock *sk, int *size_goal, int flags); +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, + int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); @@ -2011,6 +2014,11 @@ struct tcp_request_sock_ops { enum tcp_synack_type synack_type); }; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +#if IS_ENABLED(CONFIG_IPV6) +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; +#endif + #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From e66b2f31a068dd67172008459678821a79e4ea24 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jan 2020 07:59:23 -0800 Subject: tcp: clean ext on tx recycle Otherwise we will find stray/unexpected/old extensions value on next iteration. On tcp_write_xmit() we can end-up splitting an already queued skb in two parts, via tso_fragment(). The newly created skb can be allocated via the tx cache and an upper layer will not be aware of it, so that upper layer cannot set the ext properly. Resetting the ext on recycle ensures that stale data is not propagated in to packet headers or elsewhere. An alternative would be add an additional hook in tso_fragment() or in sk_stream_alloc_skb() to init the ext for upper layers that need it. 
Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8766f9bc3e70..432ff73d20f3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1464,6 +1464,7 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { + skb_ext_reset(skb); skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; -- cgit v1.2.3 From 468672b24fbc1c018e192dcc90e887bc9a9b2595 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 9 Jan 2020 14:46:09 -0800 Subject: devlink: add macro for "fw.psid" The "fw.psid" devlink info version is documented in devlink-info.rst, and used by one driver. However, there is no associated macro for this firmware version like there is for others. Add one now. Signed-off-by: Jacob Keller Signed-off-by: David S. 
Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 453f45cc1519..4e80d9acdb86 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -485,6 +485,8 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_UNDI "fw.undi" /* NCSI support/handler version */ #define DEVLINK_INFO_VERSION_GENERIC_FW_NCSI "fw.ncsi" +/* FW parameter set id */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_PSID "fw.psid" struct devlink_region; struct devlink_info_req; -- cgit v1.2.3 From f4bdd7103652fab5ac8b0ed75fa5cbc515b50b8b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 9 Jan 2020 14:46:10 -0800 Subject: devlink: move devlink documentation to subfolder Combine the documentation for devlink into a subfolder, and provide an index.rst file that can be used to generally describe devlink. Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4e80d9acdb86..a6856f1d5d1f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -564,7 +564,7 @@ struct devlink_trap { }; /* All traps must be documented in - * Documentation/networking/devlink-trap.rst + * Documentation/networking/devlink/devlink-trap.rst */ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_SMAC_MC, @@ -598,7 +598,7 @@ enum devlink_trap_generic_id { }; /* All trap groups must be documented in - * Documentation/networking/devlink-trap.rst + * Documentation/networking/devlink/devlink-trap.rst */ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, -- cgit v1.2.3 From c0e4eadfb8daf2e9557c7450f9b237c08b404419 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 13 Jan 2020 23:31:39 +0100 Subject: net: macsec: move some definitions in a dedicated header This patch moves some 
structure, type and identifier definitions into a MACsec specific header. This patch does not modify how the MACsec code is running and only move things around. This is a preparation for the future MACsec hardware offloading support, which will re-use those definitions outside macsec.c. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/net/macsec.h | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 include/net/macsec.h (limited to 'include/net') diff --git a/include/net/macsec.h b/include/net/macsec.h new file mode 100644 index 000000000000..e7b41c1043f6 --- /dev/null +++ b/include/net/macsec.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * MACsec netdev header, used for h/w accelerated implementations. + * + * Copyright (c) 2015 Sabrina Dubroca + */ +#ifndef _NET_MACSEC_H_ +#define _NET_MACSEC_H_ + +#include +#include +#include + +typedef u64 __bitwise sci_t; + +#define MACSEC_NUM_AN 4 /* 2 bits for the association number */ + +/** + * struct macsec_key - SA key + * @id: user-provided key identifier + * @tfm: crypto struct, key storage + */ +struct macsec_key { + u8 id[MACSEC_KEYID_LEN]; + struct crypto_aead *tfm; +}; + +struct macsec_rx_sc_stats { + __u64 InOctetsValidated; + __u64 InOctetsDecrypted; + __u64 InPktsUnchecked; + __u64 InPktsDelayed; + __u64 InPktsOK; + __u64 InPktsInvalid; + __u64 InPktsLate; + __u64 InPktsNotValid; + __u64 InPktsNotUsingSA; + __u64 InPktsUnusedSA; +}; + +struct macsec_rx_sa_stats { + __u32 InPktsOK; + __u32 InPktsInvalid; + __u32 InPktsNotValid; + __u32 InPktsNotUsingSA; + __u32 InPktsUnusedSA; +}; + +struct macsec_tx_sa_stats { + __u32 OutPktsProtected; + __u32 OutPktsEncrypted; +}; + +struct macsec_tx_sc_stats { + __u64 OutPktsProtected; + __u64 OutPktsEncrypted; + __u64 OutOctetsProtected; + __u64 OutOctetsEncrypted; +}; + +/** + * struct macsec_rx_sa - receive secure association + * @active: + * @next_pn: packet number 
expected for the next packet + * @lock: protects next_pn manipulations + * @key: key structure + * @stats: per-SA stats + */ +struct macsec_rx_sa { + struct macsec_key key; + spinlock_t lock; + u32 next_pn; + refcount_t refcnt; + bool active; + struct macsec_rx_sa_stats __percpu *stats; + struct macsec_rx_sc *sc; + struct rcu_head rcu; +}; + +struct pcpu_rx_sc_stats { + struct macsec_rx_sc_stats stats; + struct u64_stats_sync syncp; +}; + +struct pcpu_tx_sc_stats { + struct macsec_tx_sc_stats stats; + struct u64_stats_sync syncp; +}; + +/** + * struct macsec_rx_sc - receive secure channel + * @sci: secure channel identifier for this SC + * @active: channel is active + * @sa: array of secure associations + * @stats: per-SC stats + */ +struct macsec_rx_sc { + struct macsec_rx_sc __rcu *next; + sci_t sci; + bool active; + struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; + struct pcpu_rx_sc_stats __percpu *stats; + refcount_t refcnt; + struct rcu_head rcu_head; +}; + +/** + * struct macsec_tx_sa - transmit secure association + * @active: + * @next_pn: packet number to use for the next packet + * @lock: protects next_pn manipulations + * @key: key structure + * @stats: per-SA stats + */ +struct macsec_tx_sa { + struct macsec_key key; + spinlock_t lock; + u32 next_pn; + refcount_t refcnt; + bool active; + struct macsec_tx_sa_stats __percpu *stats; + struct rcu_head rcu; +}; + +/** + * struct macsec_tx_sc - transmit secure channel + * @active: + * @encoding_sa: association number of the SA currently in use + * @encrypt: encrypt packets on transmit, or authenticate only + * @send_sci: always include the SCI in the SecTAG + * @end_station: + * @scb: single copy broadcast flag + * @sa: array of secure associations + * @stats: stats for this TXSC + */ +struct macsec_tx_sc { + bool active; + u8 encoding_sa; + bool encrypt; + bool send_sci; + bool end_station; + bool scb; + struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; + struct pcpu_tx_sc_stats __percpu *stats; +}; + +/** + * 
struct macsec_secy - MACsec Security Entity + * @netdev: netdevice for this SecY + * @n_rx_sc: number of receive secure channels configured on this SecY + * @sci: secure channel identifier used for tx + * @key_len: length of keys used by the cipher suite + * @icv_len: length of ICV used by the cipher suite + * @validate_frames: validation mode + * @operational: MAC_Operational flag + * @protect_frames: enable protection for this SecY + * @replay_protect: enable packet number checks on receive + * @replay_window: size of the replay window + * @tx_sc: transmit secure channel + * @rx_sc: linked list of receive secure channels + */ +struct macsec_secy { + struct net_device *netdev; + unsigned int n_rx_sc; + sci_t sci; + u16 key_len; + u16 icv_len; + enum macsec_validation_type validate_frames; + bool operational; + bool protect_frames; + bool replay_protect; + u32 replay_window; + struct macsec_tx_sc tx_sc; + struct macsec_rx_sc __rcu *rx_sc; +}; + +#endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From 76564261a7db80c5f5c624e0122a28787f266bdf Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 13 Jan 2020 23:31:40 +0100 Subject: net: macsec: introduce the macsec_context structure This patch introduces the macsec_context structure. It will be used in the kernel to exchange information between the common MACsec implementation (macsec.c) and the MACsec hardware offloading implementations. This structure contains pointers to MACsec specific structures which contain the actual MACsec configuration, and to the underlying device (phydev for now). Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- include/net/macsec.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/macsec.h b/include/net/macsec.h index e7b41c1043f6..0b98803f92ec 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -174,4 +174,25 @@ struct macsec_secy { struct macsec_rx_sc __rcu *rx_sc; }; +/** + * struct macsec_context - MACsec context for hardware offloading + */ +struct macsec_context { + struct phy_device *phydev; + enum macsec_offload offload; + + struct macsec_secy *secy; + struct macsec_rx_sc *rx_sc; + struct { + unsigned char assoc_num; + u8 key[MACSEC_KEYID_LEN]; + union { + struct macsec_rx_sa *rx_sa; + struct macsec_tx_sa *tx_sa; + }; + } sa; + + u8 prepare:1; +}; + #endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From 0830e20b62ad156f7df5ff5b9c4cea280ebe8fef Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 13 Jan 2020 23:31:41 +0100 Subject: net: macsec: introduce MACsec ops This patch introduces MACsec ops for drivers to support offloading MACsec operations. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- include/net/macsec.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/macsec.h b/include/net/macsec.h index 0b98803f92ec..16e7e5061178 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -195,4 +195,28 @@ struct macsec_context { u8 prepare:1; }; +/** + * struct macsec_ops - MACsec offloading operations + */ +struct macsec_ops { + /* Device wide */ + int (*mdo_dev_open)(struct macsec_context *ctx); + int (*mdo_dev_stop)(struct macsec_context *ctx); + /* SecY */ + int (*mdo_add_secy)(struct macsec_context *ctx); + int (*mdo_upd_secy)(struct macsec_context *ctx); + int (*mdo_del_secy)(struct macsec_context *ctx); + /* Security channels */ + int (*mdo_add_rxsc)(struct macsec_context *ctx); + int (*mdo_upd_rxsc)(struct macsec_context *ctx); + int (*mdo_del_rxsc)(struct macsec_context *ctx); + /* Security associations */ + int (*mdo_add_rxsa)(struct macsec_context *ctx); + int (*mdo_upd_rxsa)(struct macsec_context *ctx); + int (*mdo_del_rxsa)(struct macsec_context *ctx); + int (*mdo_add_txsa)(struct macsec_context *ctx); + int (*mdo_upd_txsa)(struct macsec_context *ctx); + int (*mdo_del_txsa)(struct macsec_context *ctx); +}; + #endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From 5c937de78b39e47ce9924fc4b863c5b727edc328 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 13 Jan 2020 23:31:47 +0100 Subject: net: macsec: PN wrap callback Allow macsec_pn_wrapped to be called from hardware drivers to notify when a PN rolls over. Some drivers might use an interrupt to implement this. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- include/net/macsec.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/macsec.h b/include/net/macsec.h index 16e7e5061178..92e43db8b566 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -219,4 +219,6 @@ struct macsec_ops { int (*mdo_del_txsa)(struct macsec_context *ctx); }; +void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); + #endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From 1e301fd04eaaa5b1e3c202450d86864e6714d783 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:10 +0200 Subject: ipv4: Encapsulate function arguments in a struct fib_dump_info() is used to prepare RTM_{NEW,DEL}ROUTE netlink messages using the passed arguments. Currently, the function takes 11 arguments, 6 of which are attributes of the route being dumped (e.g., prefix, TOS). The next patch will need the function to also dump to user space an indication if the route is present in hardware or not. Instead of passing yet another argument, change the function to take a struct containing the different route attributes. v2: * Name last argument of fib_dump_info() * Move 'struct fib_rt_info' to include/net/ip_fib.h so that it could later be passed to fib_alias_hw_flags_set() Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b9cba41c6d4f..0c071c820e33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -204,6 +204,15 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) +struct fib_rt_info { + struct fib_info *fi; + u32 tb_id; + __be32 dst; + int dst_len; + u8 tos; + u8 type; +}; + struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; -- cgit v1.2.3 From 90b93f1b31f86dcde2fa3c57f1ae33d28d87a1f8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:11 +0200 Subject: ipv4: Add "offload" and "trap" indications to routes When performing L3 offload, routes and nexthops are usually programmed into two different tables in the underlying device. Therefore, the fact that a nexthop resides in hardware does not necessarily mean that all the associated routes also reside in hardware and vice-versa. While the kernel can signal to user space the presence of a nexthop in hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag for routes. In addition, the fact that a route resides in hardware does not necessarily mean that the traffic is offloaded. For example, unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap packets to the CPU so that the kernel will be able to generate the appropriate ICMP error packet. This patch adds an "offload" and "trap" indications to IPv4 routes, so that users will have better visibility into the offload process. 'struct fib_alias' is extended with two new fields that indicate if the route resides in hardware or not and if it is offloading traffic from the kernel or trapping packets to it. 
Note that the new fields are added in the 6 bytes hole and therefore the struct still fits in a single cache line [1]. Capable drivers are expected to invoke fib_alias_hw_flags_set() with the route's key in order to set the flags. The indications are dumped to user space via a new flags (i.e., 'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the ancillary header. v2: * Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set() [1] struct fib_alias { struct hlist_node fa_list; /* 0 16 */ struct fib_info * fa_info; /* 16 8 */ u8 fa_tos; /* 24 1 */ u8 fa_type; /* 25 1 */ u8 fa_state; /* 26 1 */ u8 fa_slen; /* 27 1 */ u32 tb_id; /* 28 4 */ s16 fa_default; /* 32 2 */ u8 offload:1; /* 34: 0 1 */ u8 trap:1; /* 34: 1 1 */ u8 unused:6; /* 34: 2 1 */ /* XXX 5 bytes hole, try to pack */ struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */ /* size: 56, cachelines: 1, members: 12 */ /* sum members: 50, holes: 1, sum holes: 5 */ /* sum bitfield members: 8 bits (1 bytes) */ /* forced alignments: 1, forced holes: 1, sum forced holes: 5 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 0c071c820e33..6a1ae49809de 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -211,6 +211,9 @@ struct fib_rt_info { int dst_len; u8 tos; u8 type; + u8 offload:1, + trap:1, + unused:6; }; struct fib_entry_notifier_info { @@ -473,6 +476,7 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap, void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); -- cgit v1.2.3 From bb3c4ab93e44784c1e958bdbba7824bba40f23cd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:12 +0200 Subject: ipv6: Add "offload" and "trap" indications to routes In a similar fashion to previous patch, add "offload" and "trap" indication to IPv6 routes. This is done by using two unused bits in 'struct fib6_info' to hold these indications. Capable drivers are expected to set these when processing the various in-kernel route notifications. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index b579faea41e9..fd60a8ac02ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -192,7 +192,9 @@ struct fib6_info { dst_nopolicy:1, dst_host:1, fib6_destroying:1, - unused:3; + offload:1, + trap:1, + unused:1; struct rcu_head rcu; struct nexthop *nh; @@ -329,6 +331,13 @@ static inline void fib6_info_release(struct fib6_info *f6i) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } +static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, + bool trap) +{ + f6i->offload = offload; + f6i->trap = trap; +} + enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, -- cgit v1.2.3 From 600a87490ff9823d065fc15e86c709e707033ecc Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Tue, 7 Jan 2020 00:43:17 +0000 Subject: Bluetooth: Implementation of MGMT_OP_SET_BLOCKED_KEYS. MGMT command is added to receive the list of blocked keys from user-space. The list is used to: 1) Block keys from being distributed by the device during the key distribution phase of SMP. 2) Filter out any keys that were previously saved so they are no longer used. 
Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 10 ++++++++++ include/net/bluetooth/mgmt.h | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index faebe3859931..89ecf0a80aa1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -118,6 +118,13 @@ struct bt_uuid { u8 svc_hint; }; +struct blocked_key { + struct list_head list; + struct rcu_head rcu; + u8 type; + u8 val[16]; +}; + struct smp_csrk { bdaddr_t bdaddr; u8 bdaddr_type; @@ -397,6 +404,7 @@ struct hci_dev { struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; + struct list_head blocked_keys; struct hci_dev_stats stat; @@ -1123,6 +1131,8 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa); void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type); +bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]); +void hci_blocked_keys_clear(struct hci_dev *hdev); void hci_smp_irks_clear(struct hci_dev *hdev); bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 9cee7ddc6741..a90666af05bd 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -654,6 +654,23 @@ struct mgmt_cp_set_phy_confguration { } __packed; #define MGMT_SET_PHY_CONFIGURATION_SIZE 4 +#define MGMT_OP_SET_BLOCKED_KEYS 0x0046 + +#define HCI_BLOCKED_KEY_TYPE_LINKKEY 0x00 +#define HCI_BLOCKED_KEY_TYPE_LTK 0x01 +#define HCI_BLOCKED_KEY_TYPE_IRK 0x02 + +struct mgmt_blocked_key_info { + __u8 type; + __u8 val[16]; +} __packed; + +struct mgmt_cp_set_blocked_keys { + __le16 key_count; + struct mgmt_blocked_key_info keys[0]; +} 
__packed; +#define MGMT_OP_SET_BLOCKED_KEYS_SIZE 2 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit v1.2.3 From 4de0fc599eb936d37542f819e931ba3fd8e435ca Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 15 Jan 2020 13:02:11 -0800 Subject: Bluetooth: Add definitions for CIS connections This adds the HCI definitions for handling CIS connections along with ISO data packets. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 159 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 07b6ecedc6ce..6293bdd7d862 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -27,6 +27,7 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 +#define HCI_MAX_ISO_SIZE 251 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) @@ -303,6 +304,7 @@ enum { #define HCI_ACLDATA_PKT 0x02 #define HCI_SCODATA_PKT 0x03 #define HCI_EVENT_PKT 0x04 +#define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 #define HCI_VENDOR_PKT 0xff @@ -352,6 +354,15 @@ enum { #define ACL_ACTIVE_BCAST 0x04 #define ACL_PICO_BCAST 0x08 +/* ISO PB flags */ +#define ISO_START 0x00 +#define ISO_CONT 0x01 +#define ISO_SINGLE 0x02 +#define ISO_END 0x03 + +/* ISO TS flags */ +#define ISO_TS 0x01 + /* Baseband links */ #define SCO_LINK 0x00 #define ACL_LINK 0x01 @@ -359,6 +370,7 @@ enum { /* Low Energy links do not have defined link type. 
Use invented one */ #define LE_LINK 0x80 #define AMP_LINK 0x81 +#define ISO_LINK 0x82 #define INVALID_LINK 0xff /* LMP features */ @@ -440,6 +452,8 @@ enum { #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 #define HCI_LE_CHAN_SEL_ALG2 0x40 +#define HCI_LE_CIS_MASTER 0x10 +#define HCI_LE_CIS_SLAVE 0x20 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -1718,6 +1732,86 @@ struct hci_cp_le_set_adv_set_rand_addr { bdaddr_t bdaddr; } __packed; +#define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 +struct hci_rp_le_read_buffer_size_v2 { + __u8 status; + __le16 acl_mtu; + __u8 acl_max_pkt; + __le16 iso_mtu; + __u8 iso_max_pkt; +} __packed; + +#define HCI_OP_LE_READ_ISO_TX_SYNC 0x2061 +struct hci_cp_le_read_iso_tx_sync { + __le16 handle; +} __packed; + +struct hci_rp_le_read_iso_tx_sync { + __u8 status; + __le16 handle; + __le16 seq; + __le32 imestamp; + __u8 offset[3]; +} __packed; + +#define HCI_OP_LE_SET_CIG_PARAMS 0x2062 +struct hci_cis_params { + __u8 cis_id; + __le16 m_sdu; + __le16 s_sdu; + __u8 m_phy; + __u8 s_phy; + __u8 m_rtn; + __u8 s_rtn; +} __packed; + +struct hci_cp_le_set_cig_params { + __u8 cig_id; + __u8 m_interval[3]; + __u8 s_interval[3]; + __u8 sca; + __u8 packing; + __u8 framing; + __le16 m_latency; + __le16 s_latency; + __u8 num_cis; + struct hci_cis_params cis[0]; +} __packed; + +struct hci_rp_le_set_cig_params { + __u8 status; + __u8 cig_id; + __u8 num_handles; + __le16 handle[0]; +} __packed; + +#define HCI_OP_LE_CREATE_CIS 0x2064 +struct hci_cis { + __le16 cis_handle; + __le16 acl_handle; +} __packed; + +struct hci_cp_le_create_cis { + __u8 num_cis; + struct hci_cis cis[0]; +} __packed; + +#define HCI_OP_LE_REMOVE_CIG 0x2065 +struct hci_cp_le_remove_cig { + __u8 cig_id; +} __packed; + +#define HCI_OP_LE_ACCEPT_CIS 0x2066 +struct hci_cp_le_accept_cis { + __le16 handle; +} __packed; + +#define HCI_OP_LE_REJECT_CIS 0x2067 +struct hci_cp_le_reject_cis { + __le16 handle; + __u8 reason; +} __packed; + /* ---- HCI Events ---- */ #define 
HCI_EV_INQUIRY_COMPLETE 0x01 @@ -2189,7 +2283,7 @@ struct hci_ev_le_direct_adv_info { #define HCI_EV_LE_PHY_UPDATE_COMPLETE 0x0c struct hci_ev_le_phy_update_complete { __u8 status; - __u16 handle; + __le16 handle; __u8 tx_phy; __u8 rx_phy; } __packed; @@ -2234,6 +2328,34 @@ struct hci_evt_le_ext_adv_set_term { __u8 num_evts; } __packed; +#define HCI_EVT_LE_CIS_ESTABLISHED 0x19 +struct hci_evt_le_cis_established { + __u8 status; + __le16 handle; + __u8 cig_sync_delay[3]; + __u8 cis_sync_delay[3]; + __u8 m_latency[3]; + __u8 s_latency[3]; + __u8 m_phy; + __u8 s_phy; + __u8 nse; + __u8 m_bn; + __u8 s_bn; + __u8 m_ft; + __u8 s_ft; + __le16 m_mtu; + __le16 s_mtu; + __le16 interval; +} __packed; + +#define HCI_EVT_LE_CIS_REQ 0x1a +struct hci_evt_le_cis_req { + __le16 acl_handle; + __le16 cis_handle; + __u8 cig_id; + __u8 cis_id; +} __packed; + #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ @@ -2262,6 +2384,7 @@ struct hci_ev_si_security { #define HCI_EVENT_HDR_SIZE 2 #define HCI_ACL_HDR_SIZE 4 #define HCI_SCO_HDR_SIZE 3 +#define HCI_ISO_HDR_SIZE 4 struct hci_command_hdr { __le16 opcode; /* OCF & OGF */ @@ -2283,6 +2406,30 @@ struct hci_sco_hdr { __u8 dlen; } __packed; +struct hci_iso_hdr { + __le16 handle; + __le16 dlen; + __u8 data[0]; +} __packed; + +/* ISO data packet status flags */ +#define HCI_ISO_STATUS_VALID 0x00 +#define HCI_ISO_STATUS_INVALID 0x01 +#define HCI_ISO_STATUS_NOP 0x02 + +#define HCI_ISO_DATA_HDR_SIZE 4 +struct hci_iso_data_hdr { + __le16 sn; + __le16 slen; +}; + +#define HCI_ISO_TS_DATA_HDR_SIZE 8 +struct hci_iso_ts_data_hdr { + __le32 ts; + __le16 sn; + __le16 slen; +}; + static inline struct hci_event_hdr *hci_event_hdr(const struct sk_buff *skb) { return (struct hci_event_hdr *) skb->data; @@ -2308,4 +2455,14 @@ static inline struct hci_sco_hdr *hci_sco_hdr(const struct sk_buff *skb) #define hci_handle(h) (h & 0x0fff) #define hci_flags(h) (h >> 12) +/* ISO handle and flags pack/unpack */ +#define hci_iso_flags_pb(f) 
(f & 0x0003) +#define hci_iso_flags_ts(f) ((f >> 2) & 0x0001) +#define hci_iso_flags_pack(pb, ts) ((pb & 0x03) | ((ts & 0x01) << 2)) + +/* ISO data length and flags pack/unpack */ +#define hci_iso_data_len_pack(h, f) ((__u16) ((h) | ((f) << 14))) +#define hci_iso_data_len(h) ((h) & 0x3fff) +#define hci_iso_data_flags(h) ((h) >> 14) + #endif /* __HCI_H */ -- cgit v1.2.3 From f9a619db7c137b7c2dec0414d8deb8ec762ae8f9 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 15 Jan 2020 13:02:17 -0800 Subject: Bluetooth: monitor: Add support for ISO packets This enables passing ISO packets to the monitor socket. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_mon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 240786b04a46..2d5fcda1bcd0 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -49,6 +49,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_CLOSE 15 #define HCI_MON_CTRL_COMMAND 16 #define HCI_MON_CTRL_EVENT 17 +#define HCI_MON_ISO_TX_PKT 18 +#define HCI_MON_ISO_RX_PKT 19 struct hci_mon_new_index { __u8 type; -- cgit v1.2.3 From 445db8d09659eb27bcd5920cb91d91686f0197d0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 22:00:57 +0100 Subject: netfilter: flowtable: remove dying bit, use teardown bit instead The dying bit removes the conntrack entry if the netdev that owns this flow is going down. Instead, use the teardown mechanism to push back the flow to conntrack to let the classic software path decide what to do with it. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 415b8f49d150..4ad924d5f983 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -85,7 +85,6 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 -#define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 #define FLOW_OFFLOAD_HW 0x10 #define FLOW_OFFLOAD_HW_DYING 0x20 @@ -134,10 +133,6 @@ int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); void flow_offload_teardown(struct flow_offload *flow); -static inline void flow_offload_dead(struct flow_offload *flow) -{ - flow->flags |= FLOW_OFFLOAD_DYING; -} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, -- cgit v1.2.3 From 355a8b13f87a8964ebe785b065f1388a1bd00c7e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 20:41:15 +0100 Subject: netfilter: flowtable: use atomic bitwise operations for flow flags Originally, all flow flag bits were set only from the workqueue. With the introduction of the flow teardown state and hardware offload this is no longer true. Let's be safe and use atomic bitwise operations to operate on flow flags. 
Fixes: 59c466dd68e7 ("netfilter: nf_flow_table: add a new flow state for tearing down offloading") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 4ad924d5f983..5a10e28c3e40 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -83,12 +83,14 @@ struct flow_offload_tuple_rhash { struct flow_offload_tuple tuple; }; -#define FLOW_OFFLOAD_SNAT 0x1 -#define FLOW_OFFLOAD_DNAT 0x2 -#define FLOW_OFFLOAD_TEARDOWN 0x8 -#define FLOW_OFFLOAD_HW 0x10 -#define FLOW_OFFLOAD_HW_DYING 0x20 -#define FLOW_OFFLOAD_HW_DEAD 0x40 +enum nf_flow_flags { + NF_FLOW_SNAT, + NF_FLOW_DNAT, + NF_FLOW_TEARDOWN, + NF_FLOW_HW, + NF_FLOW_HW_DYING, + NF_FLOW_HW_DEAD, +}; enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC = 0, @@ -98,7 +100,7 @@ enum flow_offload_type { struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; - u16 flags; + unsigned long flags; u16 type; u32 timeout; struct rcu_head rcu_head; -- cgit v1.2.3 From a5449cdcaac5c78d62b8bea8f79158071f23da01 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Jan 2020 09:56:27 +0100 Subject: netfilter: flowtable: add nf_flowtable_hw_offload() helper function This function checks for the NF_FLOWTABLE_HW_OFFLOAD flag, meaning that the flowtable hardware offload is enabled. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 5a10e28c3e40..9ee1eaeaab04 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -47,6 +47,11 @@ struct nf_flowtable { possible_net_t net; }; +static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable) +{ + return flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD; +} + enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, -- cgit v1.2.3 From f698fe40829b21088d323c8b0a7c626571528fc6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jan 2020 12:56:47 +0100 Subject: netfilter: flowtable: refresh flow if hardware offload fails If nf_flow_offload_add() fails to add the flow to hardware, then the NF_FLOW_HW_REFRESH flag bit is set and the flow remains in the flowtable software path. If flowtable hardware offload is enabled, this patch enqueues a new request to offload this flow to hardware. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 9ee1eaeaab04..e0f709d9d547 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -95,6 +95,7 @@ enum nf_flow_flags { NF_FLOW_HW, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, + NF_FLOW_HW_REFRESH, }; enum flow_offload_type { -- cgit v1.2.3 From 56f200c78ce4d94680a27a1ce97a29ebeb4f23e1 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 16 Jan 2020 21:16:46 +0100 Subject: netns: Constify exported functions Mark function parameters as 'const' where possible. Signed-off-by: Guillaume Nault Acked-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- include/net/net_namespace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b8ceaf0cd997..854d39ef1ca3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -347,9 +347,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); -int peernet2id(struct net *net, struct net *peer); -bool peernet_has_id(struct net *net, struct net *peer); -struct net *get_net_ns_by_id(struct net *net, int id); +int peernet2id(const struct net *net, struct net *peer); +bool peernet_has_id(const struct net *net, struct net *peer); +struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; @@ -427,7 +427,7 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif -static inline int rt_genid_ipv4(struct net *net) +static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } @@ -459,7 +459,7 @@ static inline void rt_genid_bump_all(struct net *net) rt_genid_bump_ipv6(net); } -static inline int fnhe_genid(struct net *net) +static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } -- cgit v1.2.3 From 95f0ead8f04bec18e474594ef585f3734bd85b4c Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:48 +0200 Subject: devlink: Add non-routable packet trap Add packet trap that can report packets that reached the router, but are non-routable. For example, IGMP queries can be flooded by the device in layer 2 and reach the router. Such packets should not be routed and instead dropped. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index a6856f1d5d1f..08b757753e1c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -591,6 +591,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -659,6 +660,8 @@ enum devlink_trap_group_generic_id { "ipv4_lpm_miss" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ "ipv6_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_NON_ROUTABLE \ + "non_routable_packet" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 13c056ec7d006b11557cebd9f1803edd646d2876 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:54 +0200 Subject: devlink: Add tunnel generic packet traps Add packet traps that can report packets that were dropped during tunnel decapsulation. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 08b757753e1c..455282a4b714 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -592,6 +592,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -605,6 +606,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -662,6 +664,8 @@ enum devlink_trap_group_generic_id { "ipv6_lpm_miss" #define DEVLINK_TRAP_GENERIC_NAME_NON_ROUTABLE \ "non_routable_packet" +#define DEVLINK_TRAP_GENERIC_NAME_DECAP_ERROR \ + "decap_error" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -669,6 +673,8 @@ enum devlink_trap_group_generic_id { "l3_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ + "tunnel_drops" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group, _metadata_cap) \ { \ -- cgit v1.2.3 From c3cae4916e57d2f0364d5e7769218547fb1b7c60 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:58 +0200 Subject: devlink: Add overlay source MAC is multicast trap Add packet trap that can report NVE packets that the device decided to drop because their overlay source MAC is multicast. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 455282a4b714..2813fd06ee89 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -593,6 +593,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -666,6 +667,8 @@ enum devlink_trap_group_generic_id { "non_routable_packet" #define DEVLINK_TRAP_GENERIC_NAME_DECAP_ERROR \ "decap_error" +#define DEVLINK_TRAP_GENERIC_NAME_OVERLAY_SMAC_MC \ + "overlay_smac_is_mc" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 43a825afc91e2b06af1e8e7422198e759c2c5e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 20 Jan 2020 10:29:17 +0100 Subject: xsk, net: Make sock_def_readable() have external linkage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP sockets use the default implementation of struct sock's sk_data_ready callback, which is sock_def_readable(). This function is called in the XDP socket fast-path, and involves a retpoline. By letting sock_def_readable() have external linkage, and being called directly, the retpoline can be avoided. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200120092917.13949-1-bjorn.topel@gmail.com --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..0891c55f1e82 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2612,4 +2612,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) return false; } +void sock_def_readable(struct sock *sk); + #endif /* _SOCK_H */ -- cgit v1.2.3 From 84bf557fb02f5924c109a21a160ffc353d878487 Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:24 +0530 Subject: net: sched: pie: move common code to pie.h This patch moves macros, structures and small functions common to PIE and FQ-PIE (to be added in a future commit) from the file net/sched/sch_pie.c to the header file include/net/pie.h. All the moved functions are made inline. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. 
Miller --- include/net/pie.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 include/net/pie.h (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h new file mode 100644 index 000000000000..440213ec83eb --- /dev/null +++ b/include/net/pie.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __NET_SCHED_PIE_H +#define __NET_SCHED_PIE_H + +#include +#include +#include +#include +#include + +#define QUEUE_THRESHOLD 16384 +#define DQCOUNT_INVALID -1 +#define DTIME_INVALID 0xffffffffffffffff +#define MAX_PROB 0xffffffffffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ + u8 dq_rate_estimator; /* to calculate delay using Little's law */ +}; + +/* variables used */ +struct pie_vars { + u64 prob; /* probability but scaled by u64 limit. 
*/ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u64 accu_prob; /* accumulated drop probability */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ + u8 accu_prob_overflows; /* overflows of accu_prob */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private skb vars */ +struct pie_skb_cb { + psched_time_t enqueue_time; +}; + +static inline void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ + params->ecn = false; + params->bytemode = false; + params->dq_rate_estimator = false; +} + +static inline void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->dq_tstamp = DTIME_INVALID; + vars->accu_prob = 0; + vars->avg_dq_rate = 0; + /* default of 150 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); + vars->accu_prob_overflows = 0; +} + +static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); + return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) +{ + return get_pie_cb(skb)->enqueue_time; +} + +static inline void pie_set_enqueue_time(struct sk_buff *skb) +{ + get_pie_cb(skb)->enqueue_time = psched_get_time(); +} + +#endif -- cgit v1.2.3 From 805a5a23a4c4ee126e781bd14a1deb242395e817 Mon Sep 17 
00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:25 +0530 Subject: pie: use U64_MAX to denote (2^64 - 1) Use the U64_MAX macro to denote the constant (2^64 - 1). Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index 440213ec83eb..7ef375db5bab 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -10,8 +10,8 @@ #define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 -#define DTIME_INVALID 0xffffffffffffffff -#define MAX_PROB 0xffffffffffffffff +#define DTIME_INVALID U64_MAX +#define MAX_PROB U64_MAX #define PIE_SCALE 8 /* parameters used */ -- cgit v1.2.3 From cf4eeee5ff56180b525bfb6a204071216ca4000a Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:26 +0530 Subject: pie: rearrange macros in order of length Rearrange macros in order of length and align the values to improve readability. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index 7ef375db5bab..397c7abf0879 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -8,11 +8,11 @@ #include #include -#define QUEUE_THRESHOLD 16384 -#define DQCOUNT_INVALID -1 -#define DTIME_INVALID U64_MAX -#define MAX_PROB U64_MAX -#define PIE_SCALE 8 +#define MAX_PROB U64_MAX +#define DTIME_INVALID U64_MAX +#define QUEUE_THRESHOLD 16384 +#define DQCOUNT_INVALID -1 +#define PIE_SCALE 8 /* parameters used */ struct pie_params { -- cgit v1.2.3 From 1dbfc5e071db3f5acc3c7c87a564bf57b838cf49 Mon Sep 17 00:00:00 2001 From: "Mohit P. 
Tahiliani" Date: Wed, 22 Jan 2020 23:52:27 +0530 Subject: pie: use u8 instead of bool in pie_vars Linux best practice recommends using u8 for true/false values in structures. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index 397c7abf0879..f9c6a44bdb0c 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -21,8 +21,8 @@ struct pie_params { u32 limit; /* number of packets that can be enqueued */ u32 alpha; /* alpha and beta are between 0 and 32 */ u32 beta; /* and are used for shift relative to 1 */ - bool ecn; /* true if ecn is enabled */ - bool bytemode; /* to scale drop early prob based on pkt size */ + u8 ecn; /* true if ecn is enabled */ + u8 bytemode; /* to scale drop early prob based on pkt size */ u8 dq_rate_estimator; /* to calculate delay using Little's law */ }; -- cgit v1.2.3 From 2dfb1952a9a1fde0b515f58605c11902e69415bf Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:28 +0530 Subject: pie: rearrange structure members and their initializations Rearrange the members of the structure such that closely referenced members appear together and/or fit in the same cacheline. Also, change the order of their initializations to match the order in which they appear in the structure. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. 
Miller --- include/net/pie.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index f9c6a44bdb0c..ec0fbe98ec2f 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -28,13 +28,13 @@ struct pie_params { /* variables used */ struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. */ - psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ + psched_time_t burst_time; psched_time_t dq_tstamp; /* drain rate */ + u64 prob; /* probability but scaled by u64 limit. */ u64 accu_prob; /* accumulated drop probability */ + u64 dq_count; /* measured in bytes */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 qlen_old; /* in bytes */ u8 accu_prob_overflows; /* overflows of accu_prob */ @@ -45,8 +45,8 @@ struct pie_stats { u32 packets_in; /* total number of packets enqueued */ u32 dropped; /* packets dropped due to pie_action */ u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ u32 ecn_mark; /* packets marked with ECN */ + u32 maxq; /* maximum queue size */ }; /* private skb vars */ @@ -56,11 +56,11 @@ struct pie_skb_cb { static inline void pie_params_init(struct pie_params *params) { - params->alpha = 2; - params->beta = 20; + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ + params->alpha = 2; + params->beta = 20; params->ecn = false; params->bytemode = false; params->dq_rate_estimator = false; @@ -68,12 +68,12 @@ static inline void pie_params_init(struct pie_params *params) static inline void pie_vars_init(struct pie_vars *vars) { - vars->dq_count = DQCOUNT_INVALID; + /* default of 150 ms in pschedtime */ + vars->burst_time = 
PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; + vars->dq_count = DQCOUNT_INVALID; vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); vars->accu_prob_overflows = 0; } -- cgit v1.2.3 From b42a3d7c7cfff3555d7057c20dbbe57fe75d77c0 Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:29 +0530 Subject: pie: improve comments and commenting style Improve the comments along with the commenting style used to describe the members of the structures and their initial values in the init functions. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 85 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 27 deletions(-) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index ec0fbe98ec2f..51a1984c2dce 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -14,42 +14,74 @@ #define DQCOUNT_INVALID -1 #define PIE_SCALE 8 -/* parameters used */ +/** + * struct pie_params - contains pie parameters + * @target: target delay in pschedtime + * @tupdate: interval at which drop probability is calculated + * @limit: total number of packets that can be in the queue + * @alpha: parameter to control drop probability + * @beta: parameter to control drop probability + * @ecn: is ECN marking of packets enabled + * @bytemode: is drop probability scaled based on pkt size + * @dq_rate_estimator: is Little's law used for qdelay calculation + */ struct pie_params { - psched_time_t target; /* user specified target delay in pschedtime */ - u32 tupdate; /* timer frequency (in jiffies) */ - u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between 0 and 32 */ - u32 beta; /* and are used for shift relative to 1 */ - u8 ecn; /* true if ecn is enabled */ - u8
bytemode; /* to scale drop early prob based on pkt size */ - u8 dq_rate_estimator; /* to calculate delay using Little's law */ + psched_time_t target; + u32 tupdate; + u32 limit; + u32 alpha; + u32 beta; + u8 ecn; + u8 bytemode; + u8 dq_rate_estimator; }; -/* variables used */ +/** + * struct pie_vars - contains pie variables + * @qdelay: current queue delay + * @qdelay_old: queue delay in previous qdelay calculation + * @burst_time: burst time allowance + * @dq_tstamp: timestamp at which dq rate was last calculated + * @prob: drop probability + * @accu_prob: accumulated drop probability + * @dq_count: number of bytes dequeued in a measurement cycle + * @avg_dq_rate: calculated average dq rate + * @qlen_old: queue length during previous qdelay calculation + * @accu_prob_overflows: number of times accu_prob overflows + */ struct pie_vars { psched_time_t qdelay; psched_time_t qdelay_old; psched_time_t burst_time; - psched_time_t dq_tstamp; /* drain rate */ - u64 prob; /* probability but scaled by u64 limit. 
*/ - u64 accu_prob; /* accumulated drop probability */ - u64 dq_count; /* measured in bytes */ - u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ - u32 qlen_old; /* in bytes */ - u8 accu_prob_overflows; /* overflows of accu_prob */ + psched_time_t dq_tstamp; + u64 prob; + u64 accu_prob; + u64 dq_count; + u32 avg_dq_rate; + u32 qlen_old; + u8 accu_prob_overflows; }; -/* statistics gathering */ +/** + * struct pie_stats - contains pie stats + * @packets_in: total number of packets enqueued + * @dropped: packets dropped due to pie action + * @overlimit: packets dropped due to lack of space in queue + * @ecn_mark: packets marked with ECN + * @maxq: maximum queue size + */ struct pie_stats { - u32 packets_in; /* total number of packets enqueued */ - u32 dropped; /* packets dropped due to pie_action */ - u32 overlimit; /* dropped due to lack of space in queue */ - u32 ecn_mark; /* packets marked with ECN */ - u32 maxq; /* maximum queue size */ + u32 packets_in; + u32 dropped; + u32 overlimit; + u32 ecn_mark; + u32 maxq; }; -/* private skb vars */ +/** + * struct pie_skb_cb - contains private skb vars + * @enqueue_time: timestamp when the packet is enqueued + */ struct pie_skb_cb { psched_time_t enqueue_time; }; @@ -58,7 +90,7 @@ static inline void pie_params_init(struct pie_params *params) { params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ - params->limit = 1000; /* default of 1000 packets */ + params->limit = 1000; params->alpha = 2; params->beta = 20; params->ecn = false; @@ -68,8 +100,7 @@ static inline void pie_params_init(struct pie_params *params) static inline void pie_vars_init(struct pie_vars *vars) { - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); /* 150 ms */ vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->dq_count = DQCOUNT_INVALID; -- cgit v1.2.3 
From 5205ea00cda1ac23cebfb97dfccca84722d58dfe Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:32 +0530 Subject: net: sched: pie: export symbols to be reused by FQ-PIE This patch makes the drop_early(), calculate_probability() and pie_process_dequeue() functions generic enough to be used by both PIE and FQ-PIE (to be added in a future commit). The major change here is in the way the functions take in arguments. This patch exports these functions and makes FQ-PIE dependent on sch_pie. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index 51a1984c2dce..90f5db3d29e7 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -124,4 +124,13 @@ static inline void pie_set_enqueue_time(struct sk_buff *skb) get_pie_cb(skb)->enqueue_time = psched_get_time(); } +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size); + +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen); + +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen); + #endif -- cgit v1.2.3 From ec97ecf1ebe485a17cd8395a5f35e6b80b57665a Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:33 +0530 Subject: net: sched: add Flow Queue PIE packet scheduler Principles: - Packets are classified on flows. - This is a Stochastic model (as we use a hash, several flows might be hashed to the same slot) - Each flow has a PIE managed queue. - Flows are linked onto two (Round Robin) lists, so that new flows have priority on old ones. - For a given flow, packets are not reordered. - Drops during enqueue only. - ECN capability is off by default. 
- ECN threshold (if ECN is enabled) is at 10% by default. - Uses timestamps to calculate queue delay by default. Usage: tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ] [ target TIME ] [ tupdate TIME ] [ alpha NUMBER ] [ beta NUMBER ] [ quantum BYTES ] [ memory_limit BYTES ] [ ecnprob PERCENTAGE ] [ [no]ecn ] [ [no]bytemode ] [ [no_]dq_rate_estimator ] defaults: limit: 10240 packets, flows: 1024 target: 15 ms, tupdate: 15 ms (in jiffies) alpha: 1/8, beta : 5/4 quantum: device MTU, memory_limit: 32 Mb ecnprob: 10%, ecn: off bytemode: off, dq_rate_estimator: off Signed-off-by: Mohit P. Tahiliani Signed-off-by: Sachin D. Patil Signed-off-by: V. Saicharan Signed-off-by: Mohit Bhasi Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pie.h b/include/net/pie.h index 90f5db3d29e7..fd5a37cb7993 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -81,9 +81,11 @@ struct pie_stats { /** * struct pie_skb_cb - contains private skb vars * @enqueue_time: timestamp when the packet is enqueued + * @mem_usage: size of the skb during enqueue */ struct pie_skb_cb { psched_time_t enqueue_time; + u32 mem_usage; }; static inline void pie_params_init(struct pie_params *params) -- cgit v1.2.3 From f870fa0b5768842cb4690c1c11f19f28b731ae6d Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:15 -0800 Subject: mptcp: Add MPTCP socket stubs Implements the infrastructure for MPTCP sockets. MPTCP sockets open one in-kernel TCP socket per subflow. These subflow sockets are only managed by the MPTCP socket that owns them and are not visible from userspace. This commit allows a userspace program to open an MPTCP socket with: sock = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); The resulting socket is simply a wrapper around a single regular TCP socket, without any of the MPTCP protocol implemented over the wire. 
Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/mptcp.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0573ae75c3db..98ba22379117 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -28,6 +28,8 @@ struct mptcp_ext { #ifdef CONFIG_MPTCP +void mptcp_init(void); + /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ @@ -70,6 +72,10 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, #else +static inline void mptcp_init(void) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { @@ -82,4 +88,14 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, } #endif /* CONFIG_MPTCP */ + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcpv6_init(void); +#elif IS_ENABLED(CONFIG_IPV6) +static inline int mptcpv6_init(void) +{ + return 0; +} +#endif + #endif /* __NET_MPTCP_H */ -- cgit v1.2.3 From eda7acddf8080bb2d022a8d4b8b2345eb80c63ec Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:16 -0800 Subject: mptcp: Handle MPTCP TCP options Add hooks to parse and format the MP_CAPABLE option. This option is handled according to MPTCP version 0 (RFC6824). MPTCP version 1 MP_CAPABLE (RFC6824bis/RFC8684) will be added later in coordination with related code changes. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 98ba22379117..3daec2ceb3ff 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -9,6 +9,7 @@ #define __NET_MPTCP_H #include +#include #include /* MPTCP sk_buff extension data */ @@ -26,10 +27,22 @@ struct mptcp_ext { /* one byte hole */ }; +struct mptcp_out_options { +#if IS_ENABLED(CONFIG_MPTCP) + u16 suboptions; + u64 sndr_key; + u64 rcvr_key; +#endif +}; + #ifdef CONFIG_MPTCP void mptcp_init(void); +void mptcp_parse_option(const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx); +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); + /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ @@ -76,6 +89,11 @@ static inline void mptcp_init(void) { } +static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { -- cgit v1.2.3 From cec37a6e41aae7bf3df9a3da783380a4d9325fd8 Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:18 -0800 Subject: mptcp: Handle MP_CAPABLE options for outgoing connections Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request, to capture the MP_CAPABLE in the received SYN-ACK, to add MP_CAPABLE to the final ACK of the three-way handshake. Use the .sk_rx_dst_set() handler in the subflow proto to capture when the responding SYN-ACK is received and notify the MPTCP connection layer. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3daec2ceb3ff..eabc57c3fde4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -39,8 +39,27 @@ struct mptcp_out_options { void mptcp_init(void); +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return tcp_sk(sk)->is_mptcp; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp; +} + void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts); +void mptcp_rcv_synsent(struct sock *sk); +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts); +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts); + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); /* move the skb extension owership, with the assumption that 'to' is @@ -89,11 +108,47 @@ static inline void mptcp_init(void) { } +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return false; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } +static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_rcv_synsent(struct sock *sk) +{ +} + +static inline bool mptcp_synack_options(const struct request_sock *req, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline bool mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + 
unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + return false; +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { @@ -107,6 +162,8 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, #endif /* CONFIG_MPTCP */ +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); #elif IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From 6d0060f600adfddaa43fefb96b6b12643331961e Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:23 -0800 Subject: mptcp: Write MPTCP DSS headers to outgoing data packets Per-packet metadata required to write the MPTCP DSS option is written to the skb_ext area. One write to the socket may contain more than one packet of data, which is copied to page fragments and mapped in to MPTCP DSS segments with size determined by the available page fragments and the maximum mapping length allowed by the MPTCP specification. If do_tcp_sendpages() splits a DSS segment in to multiple skbs, that's ok - the later skbs can either have duplicated DSS mapping information or none at all, and the receiver can handle that. The current implementation uses the subflow frag cache and tcp sendpages to avoid excessive code duplication. More work is required to ensure that it works correctly under memory pressure and to support MPTCP-level retransmissions. The MPTCP DSS checksum is not yet implemented. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index eabc57c3fde4..06dcc665135e 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -32,6 +32,7 @@ struct mptcp_out_options { u16 suboptions; u64 sndr_key; u64 rcvr_key; + struct mptcp_ext ext_copy; #endif }; -- cgit v1.2.3 From 648ef4b88673dadb8463bf0d4b10fbf33d55def8 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:24 -0800 Subject: mptcp: Implement MPTCP receive path Parses incoming DSS options and populates outgoing MPTCP ACK fields. MPTCP fields are parsed from the TCP option header and placed in an skb extension, allowing the upper MPTCP layer to access MPTCP options after the skb has gone through the TCP stack. The subflow implements its own data_ready() ops, which ensures that the pending data is in sequence - according to MPTCP seq number - dropping out-of-seq skbs. The DATA_READY bit flag is set if this is the case. This allows the MPTCP socket layer to determine if more data is available without having to consult the individual subflows. It additionally validates the current mapping and propagates EoF events to the connection socket. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 06dcc665135e..8619c1fca741 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -60,6 +60,8 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx); void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); @@ -150,6 +152,12 @@ static inline bool mptcp_established_options(struct sock *sk, return false; } +static inline void mptcp_incoming_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { -- cgit v1.2.3 From cc7972ea1932335e0a0ee00ac8a24b3e8304630d Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 21 Jan 2020 16:56:31 -0800 Subject: mptcp: parse and emit MP_CAPABLE option according to v1 spec This implements MP_CAPABLE options parsing and writing according to RFC 6824 bis / RFC 8684: MPTCP v1. Local key is sent on syn/ack, and both keys are sent on 3rd ack. MP_CAPABLE messages len are updated accordingly. We need the skbuff to correctly emit the above, so we push the skbuff struct as an argument all the way from tcp code to the relevant mptcp callbacks. When processing incoming MP_CAPABLE + data, build a full blown DSS-like map info, to simplify later processing. On child socket creation, we need to record the remote key, if available. Signed-off-by: Christoph Paasch Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8619c1fca741..27627e2d1bc2 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -23,7 +23,8 @@ struct mptcp_ext { data_fin:1, use_ack:1, ack64:1, - __unused:3; + mpc_map:1, + __unused:2; /* one byte hole */ }; @@ -50,10 +51,10 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } -void mptcp_parse_option(const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx); -bool mptcp_syn_options(struct sock *sk, unsigned int *size, - struct mptcp_out_options *opts); +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts); void mptcp_rcv_synsent(struct sock *sk); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); @@ -121,12 +122,14 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return false; } -static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, +static inline void mptcp_parse_option(const struct sk_buff *skb, + const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } -static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, +static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) { return false; -- cgit v1.2.3 From e42f1ac626e7f799717d006e0f8393b6d6f9fc8c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Jan 2020 16:04:02 -0800 Subject: mptcp: do not inherit inet proto ops We need to initialise the struct ourselves, else we expose tcp-specific callbacks such as tcp_splice_read which will then 
trigger splat because the socket is an mptcp one: BUG: KASAN: slab-out-of-bounds in tcp_mstamp_refresh+0x80/0xa0 net/ipv4/tcp_output.c:57 Write of size 8 at addr ffff888116aa21d0 by task syz-executor.0/5478 CPU: 1 PID: 5478 Comm: syz-executor.0 Not tainted 5.5.0-rc6 #3 Call Trace: tcp_mstamp_refresh+0x80/0xa0 net/ipv4/tcp_output.c:57 tcp_rcv_space_adjust+0x72/0x7f0 net/ipv4/tcp_input.c:612 tcp_read_sock+0x622/0x990 net/ipv4/tcp.c:1674 tcp_splice_read+0x20b/0xb40 net/ipv4/tcp.c:791 do_splice+0x1259/0x1560 fs/splice.c:1205 To prevent build error with ipv6, add the recv/sendmsg function declaration to ipv6.h. The functions are already accessible "thanks" to retpoline related work, but they are currently only made visible by socket.c specific INDIRECT_CALLABLE macros. Reported-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4e95f6df508c..cec1a54401f2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1113,6 +1113,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags); /* * reassembly.c -- cgit v1.2.3 From ef6aadcc76c97e25f62adc4e9d19684d3e5d0b87 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 24 Jan 2020 15:23:06 +0200 Subject: net: sched: Make TBF Qdisc offloadable Invoke ndo_setup_tc as appropriate to signal init / replacement, destroying and dumping of TBF Qdisc. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 47b115e2012a..ce036492986a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -854,4 +854,26 @@ struct tc_ets_qopt_offload { }; }; +enum tc_tbf_command { + TC_TBF_REPLACE, + TC_TBF_DESTROY, + TC_TBF_STATS, +}; + +struct tc_tbf_qopt_offload_replace_params { + struct psched_ratecfg rate; + u32 max_size; + struct gnet_stats_queue *qstats; +}; + +struct tc_tbf_qopt_offload { + enum tc_tbf_command command; + u32 handle; + u32 parent; + union { + struct tc_tbf_qopt_offload_replace_params replace_params; + struct tc_qopt_offload_stats stats; + }; +}; + #endif -- cgit v1.2.3 From 7b225d0b5c6dda5fefab578175f210c6fc7e389a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Jan 2020 00:17:52 +0100 Subject: netfilter: nf_tables: add NFTA_SET_ELEM_KEY_END attribute Add NFTA_SET_ELEM_KEY_END attribute to convey the closing element of the interval between kernel and userspace. This patch also adds the NFT_SET_EXT_KEY_END extension to store the closing element value in this interval. 
v4: No changes v3: New patch [sbrivio: refactor error paths and labels; add corresponding nft_set_ext_type for new key; rebase] Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fe7c50acc681..504c0aa93805 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -231,6 +231,7 @@ struct nft_userdata { * struct nft_set_elem - generic representation of set elements * * @key: element key + * @key_end: closing element key * @priv: element private data and extensions */ struct nft_set_elem { @@ -238,6 +239,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } key_end; void *priv; }; @@ -502,6 +507,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key + * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout @@ -513,6 +519,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); */ enum nft_set_extensions { NFT_SET_EXT_KEY, + NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, @@ -606,6 +613,11 @@ static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_KEY); } +static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_KEY_END); +} + static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); @@ -655,7 +667,7 @@ static inline struct nft_object 
**nft_set_ext_obj(const struct nft_set_ext *ext) void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, + const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); -- cgit v1.2.3 From f3a2181e16f1dcbf5446ed43f6b5d9f56c459f85 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 22 Jan 2020 00:17:53 +0100 Subject: netfilter: nf_tables: Support for sets with multiple ranged fields Introduce a new nested netlink attribute, NFTA_SET_DESC_CONCAT, used to specify the length of each field in a set concatenation. This allows set implementations to support concatenation of multiple ranged items, as they can divide the input key into matching data for every single field. Such set implementations would be selected as they specify support for NFT_SET_INTERVAL and allow desc->field_count to be greater than one. Explicitly disallow this for nft_set_rbtree. In order to specify the interval for a set entry, userspace would include in NFTA_SET_DESC_CONCAT attributes field lengths, and pass range endpoints as two separate keys, represented by attributes NFTA_SET_ELEM_KEY and NFTA_SET_ELEM_KEY_END. While at it, export the number of 32-bit registers available for packet matching, as nftables will need this to know the maximum number of field lengths that can be specified. For example, "packets with an IPv4 address between 192.0.2.0 and 192.0.2.42, with destination port between 22 and 25", can be expressed as two concatenated elements: NFTA_SET_ELEM_KEY: 192.0.2.0 . 22 NFTA_SET_ELEM_KEY_END: 192.0.2.42 . 
25 and NFTA_SET_DESC_CONCAT attribute would contain: NFTA_LIST_ELEM NFTA_SET_FIELD_LEN: 4 NFTA_LIST_ELEM NFTA_SET_FIELD_LEN: 2 v4: No changes v3: Complete rework, NFTA_SET_DESC_CONCAT instead of NFTA_SET_SUBKEY v2: No changes Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 504c0aa93805..4170c033d461 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -264,11 +264,15 @@ struct nft_set_iter { * @klen: key length * @dlen: data length * @size: number of set elements + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element */ struct nft_set_desc { unsigned int klen; unsigned int dlen; unsigned int size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; }; /** @@ -409,6 +413,8 @@ void nft_unregister_set(struct nft_set_type *type); * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -435,6 +441,8 @@ struct nft_set { u32 dtype; u32 objtype; u32 size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; u32 use; atomic_t nelems; u32 ndeact; -- cgit v1.2.3 From 3c4287f62044a90e73a561aa05fc46e62da173da Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 22 Jan 2020 00:17:55 +0100 Subject: nf_tables: Add set type for arbitrary concatenation of ranges This new set type allows for intervals in concatenated fields, which are expressed in the usual way, that is, simple byte concatenation with padding to 32 bits for 
single fields, and given as ranges by specifying start and end elements containing, each, the full concatenation of start and end values for the single fields. Ranges are expanded to composing netmasks, for each field: these are inserted as rules in per-field lookup tables. Bits to be classified are divided in 4-bit groups, and for each group, the lookup table contains 4^2 buckets, representing all the possible values of a bit group. This approach was inspired by the Grouper algorithm: http://www.cse.usf.edu/~ligatti/projects/grouper/ Matching is performed by a sequence of AND operations between bucket values, with buckets selected according to the value of packet bits, for each group. The result of this sequence tells us which rules matched for a given field. In order to concatenate several ranged fields, per-field rules are mapped using mapping arrays, one per field, that specify which rules should be considered while matching the next field. The mapping array for the last field contains a reference to the element originally inserted. The notes in nft_set_pipapo.c cover the algorithm in deeper detail. A pure hash-based approach is of no use here, as ranges need to be classified. An implementation based on "proxying" the existing red-black tree set type, creating a tree for each field, was considered, but deemed impractical due to the fact that elements would need to be shared between trees, at least as long as we want to keep UAPI changes to a minimum. A stand-alone implementation of this algorithm is available at: https://pipapo.lameexcu.se together with notes about possible future optimisations (in pipapo.c). This algorithm was designed with data locality in mind, and can be highly optimised for SIMD instruction sets, as the bulk of the matching work is done with repetitive, simple bitwise operations. 
At this point, without further optimisations, nft_concat_range.sh reports, for one AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$): TEST: performance net,port [ OK ] baseline (drop from netdev hook): 10190076pps baseline hash (non-ranged entries): 6179564pps baseline rbtree (match on first field only): 2950341pps set with 1000 full, ranged entries: 2304165pps port,net [ OK ] baseline (drop from netdev hook): 10143615pps baseline hash (non-ranged entries): 6135776pps baseline rbtree (match on first field only): 4311934pps set with 100 full, ranged entries: 4131471pps net6,port [ OK ] baseline (drop from netdev hook): 9730404pps baseline hash (non-ranged entries): 4809557pps baseline rbtree (match on first field only): 1501699pps set with 1000 full, ranged entries: 1092557pps port,proto [ OK ] baseline (drop from netdev hook): 10812426pps baseline hash (non-ranged entries): 6929353pps baseline rbtree (match on first field only): 3027105pps set with 30000 full, ranged entries: 284147pps net6,port,mac [ OK ] baseline (drop from netdev hook): 9660114pps baseline hash (non-ranged entries): 3778877pps baseline rbtree (match on first field only): 3179379pps set with 10 full, ranged entries: 2082880pps net6,port,mac,proto [ OK ] baseline (drop from netdev hook): 9718324pps baseline hash (non-ranged entries): 3799021pps baseline rbtree (match on first field only): 1506689pps set with 1000 full, ranged entries: 783810pps net,mac [ OK ] baseline (drop from netdev hook): 10190029pps baseline hash (non-ranged entries): 5172218pps baseline rbtree (match on first field only): 2946863pps set with 1000 full, ranged entries: 1279122pps v4: - fix build for 32-bit architectures: 64-bit division needs div_u64() (kbuild test robot ) v3: - rework interface for field length specification, NFT_SET_SUBKEY disappears and information is stored in description - remove scratch area to store closing element of ranges, as elements now come with an actual attribute to specify the upper range 
limit (Pablo Neira Ayuso) - also remove pointer to 'start' element from mapping table, closing key is now accessible via extension data - use bytes right away instead of bits for field lengths, this way we can also double the inner loop of the lookup function to take care of upper and lower bits in a single iteration (minor performance improvement) - make it clearer that set operations are actually atomic API-wise, but we can't e.g. implement flush() as one-shot action - fix type for 'dup' in nft_pipapo_insert(), check for duplicates only in the next generation, and in general take care of differentiating generation mask cases depending on the operation (Pablo Neira Ayuso) - report C implementation matching rate in commit message, so that AVX2 implementation can be compared (Pablo Neira Ayuso) v2: - protect access to scratch maps in nft_pipapo_lookup() with local_bh_disable/enable() (Florian Westphal) - drop rcu_read_lock/unlock() from nft_pipapo_lookup(), it's already implied (Florian Westphal) - explain why partial allocation failures don't need handling in pipapo_realloc_scratch(), rename 'm' to clone and update related kerneldoc to make it clear we're not operating on the live copy (Florian Westphal) - add explicit check for priv->start_elem in nft_pipapo_insert() to avoid ending up in nft_pipapo_walk() with a NULL start element, and also zero it out in every operation that might make it invalid, so that insertion doesn't proceed with an invalid element (Florian Westphal) Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 2656155b4069..29e7e1021267 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -74,6 +74,7 @@ extern struct nft_set_type nft_set_hash_type; extern struct nft_set_type 
nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +extern struct nft_set_type nft_set_pipapo_type; struct nft_expr; struct nft_regs; -- cgit v1.2.3 From 2e24cd755552350b94a7617617c6877b8cbcb701 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 23 Jan 2020 16:26:18 -0800 Subject: net_sched: fix ops->bind_class() implementations The current implementations of ops->bind_class() are merely searching for classid and updating class in the struct tcf_result, without invoking either of cl_ops->bind_tcf() or cl_ops->unbind_tcf(). This breaks the design of them as qdisc's like cbq use them to count filters too. This is why syzbot triggered the warning in cbq_destroy_class(). In order to fix this, we have to call cl_ops->bind_tcf() and cl_ops->unbind_tcf() like the filter binding path. This patch does so by refactoring out two helper functions __tcf_bind_filter() and __tcf_unbind_filter(), which are lockless and accept a Qdisc pointer, then teaching each implementation to call them correctly. Note, we merely pass the Qdisc pointer as an opaque pointer to each filter, they only need to pass it down to the helper functions without understanding it at all. Fixes: 07d79fc7d94e ("net_sched: add reverse binding for tc class") Reported-and-tested-by: syzbot+0a0596220218fcb603a8@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+63bdb6006961d8c917c6@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 33 +++++++++++++++++++-------------- include/net/sch_generic.h | 3 ++- 2 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ce036492986a..a972244ab193 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -141,31 +141,38 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline unsigned long -cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) +static inline void +__tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { - unsigned long old_cl; + unsigned long cl; - sch_tree_lock(q); - old_cl = __cls_set_class(clp, cl); - sch_tree_unlock(q); - return old_cl; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = __cls_set_class(&r->class, cl); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. 
*/ if (!q) return; - cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); - cl = cls_set_class(q, &r->class, cl); - if (cl) + sch_tree_lock(q); + __tcf_bind_filter(q, r, base); + sch_tree_unlock(q); +} + +static inline void +__tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) +{ + unsigned long cl; + + if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } @@ -173,12 +180,10 @@ static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; if (!q) return; - if ((cl = __cls_set_class(&r->class, 0)) != 0) - q->ops->cl_ops->unbind_tcf(q, cl); + __tcf_unbind_filter(q, r); } struct tcf_exts { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index fceddf89592a..151208704ed2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -318,7 +318,8 @@ struct tcf_proto_ops { void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); - void (*bind_class)(void *, u32, unsigned long); + void (*bind_class)(void *, u32, unsigned long, + void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, -- cgit v1.2.3 From 9fd1ff5d2ac7181844735806b0a703c942365291 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 25 Jan 2020 11:26:45 +0100 Subject: udp: Support UDP fraglist GRO/GSO. This patch extends UDP GRO to support fraglist GRO/GSO by using the previously introduced infrastructure. If the feature is enabled, all UDP packets are going to fraglist GRO (local input and forward). After validating the csum, we mark ip_summed as CHECKSUM_UNNECESSARY for fraglist GRO packets to make sure that the csum is not touched. Signed-off-by: Steffen Klassert Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/udp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index bad74f780831..44e0e52b585c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -167,7 +167,7 @@ typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); + struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, -- cgit v1.2.3 From 41c0d917d11edf9a34461f56e14f067aadb36101 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 27 Jan 2020 04:56:25 -0500 Subject: devlink: add macro for "fw.roce" Add definition and documentation for the new generic info "fw.roce". v2: Remove board.nvm_cfg since fw.psid is similar. Cc: Jiri Pirko Cc: Jakub Kicinski Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 5e46c24bb6e6..ce5cea428fdc 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -487,6 +487,8 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_NCSI "fw.ncsi" /* FW parameter set id */ #define DEVLINK_INFO_VERSION_GENERIC_FW_PSID "fw.psid" +/* RoCE FW version */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_ROCE "fw.roce" struct devlink_region; struct devlink_info_req; -- cgit v1.2.3 From 6cd021a58c18a1731f7e47f83e172c0c302d65e5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 27 Jan 2020 15:40:31 -0500 Subject: udp: segment looped gso packets correctly Multicast and broadcast packets can be looped from egress to ingress pre segmentation with dev_loopback_xmit. 
That function unconditionally sets ip_summed to CHECKSUM_UNNECESSARY. udp_rcv_segment segments gso packets in the udp rx path. Segmentation usually executes on egress, and does not expect packets of this type. __udp_gso_segment interprets !CHECKSUM_PARTIAL as CHECKSUM_NONE. But the offsets are not correct for gso_make_checksum. UDP GSO packets are of type CHECKSUM_PARTIAL, with their uh->check set to the correct pseudo header checksum. Reset ip_summed to this type. (CHECKSUM_PARTIAL is allowed on ingress, see comments in skbuff.h) Reported-by: syzbot Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 44e0e52b585c..4a180f2a13e3 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -476,6 +476,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, if (!inet_get_convert_csum(sk)) features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->ip_summed = CHECKSUM_PARTIAL; + /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ -- cgit v1.2.3