From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:11 +0800 Subject: netfilter: gre: Use consistent GRE_* macros instead of ones defined by netfilter. There are already some GRE_* macros in kernel, so it is unnecessary to define these macros. And remove some useless macros Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/if_tunnel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9865c8caedde..fb7337d6b985 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -39,6 +39,7 @@ #define GRE_IS_REC(f) ((f) & GRE_REC) #define GRE_IS_ACK(f) ((f) & GRE_ACK) +#define GRE_VERSION_0 __cpu_to_be16(0x0000) #define GRE_VERSION_1 __cpu_to_be16(0x0001) #define GRE_PROTO_PPP __cpu_to_be16(0x880b) #define GRE_PPTP_KEY_MASK __cpu_to_be32(0xffff) -- cgit v1.2.3 From 0d9932b2875f568d679f2af33ce610da3903ac11 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Fri, 2 Sep 2016 15:05:57 +0200 Subject: netfilter: nft_numgen: rename until attribute by modulus The _until_ attribute is renamed to _modulus_ as the behaviour is similar to other expresions with number limits (ex. nft_hash). Renaming is possible because there isn't a kernel release yet with these changes. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 28ce01d79707..24161e25576d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1126,13 +1126,13 @@ enum nft_trace_types { * enum nft_ng_attributes - nf_tables number generator expression netlink attributes * * @NFTA_NG_DREG: destination register (NLA_U32) - * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32) + * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, NFTA_NG_DREG, - NFTA_NG_UNTIL, + NFTA_NG_MODULUS, NFTA_NG_TYPE, __NFTA_NG_MAX }; -- cgit v1.2.3 From 34a3d4b2d1f1b7c81af79f6f93a6cef4c3a0f54a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 8 Sep 2016 13:55:56 -0400 Subject: xfrm: fix header file comment reference to struct xfrm_replay_state_esn Reported-by: Paul Wouters Signed-off-by: Richard Guy Briggs Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 143338978b48..1fc62b239f1b 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -298,7 +298,7 @@ enum xfrm_attr_type_t { XFRMA_ALG_AUTH_TRUNC, /* struct xfrm_algo_auth */ XFRMA_MARK, /* struct xfrm_mark */ XFRMA_TFCPAD, /* __u32 */ - XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_esn */ + XFRMA_REPLAY_ESN_VAL, /* struct xfrm_replay_state_esn */ XFRMA_SA_EXTRA_FLAGS, /* __u32 */ XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ -- cgit v1.2.3 From 70ca767ea1b2748f45e96192400e515dddbe517c Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 6 Sep 2016 08:44:19 +0200 Subject: netfilter: nft_hash: Add hash offset value Add support to pass through an offset to the hash value. With this feature, the sysadmin is able to generate a hash with a given offset value. Example: meta mark set jhash ip saddr mod 2 seed 0xabcd offset 100 This option generates marks according to the source address from 100 to 101. Signed-off-by: Laura Garcia Liebana --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 24161e25576d..8c653bbd1ead 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -731,6 +731,7 @@ enum nft_meta_keys { * @NFTA_HASH_LEN: source data length (NLA_U32) * @NFTA_HASH_MODULUS: modulus value (NLA_U32) * @NFTA_HASH_SEED: seed value (NLA_U32) + * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) */ enum nft_hash_attributes { NFTA_HASH_UNSPEC, @@ -739,6 +740,7 @@ enum nft_hash_attributes { NFTA_HASH_LEN, NFTA_HASH_MODULUS, NFTA_HASH_SEED, + NFTA_HASH_OFFSET, __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) -- cgit v1.2.3 From dbd2be0646e3239022630c426cbceefa15714bca Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Sep 2016 12:22:18 +0200 Subject: netfilter: nft_dynset: allow to invert match criteria The dynset expression matches if we can fit a new entry into the set. If there is no room for it, then it breaks the rule evaluation. This patch introduces the inversion flag so you can add rules to explicitly drop packets that don't fit into the set. For example: # nft filter input flow table xyz size 4 { ip saddr timeout 120s counter } overflow drop This is useful to provide a replacement for connlimit. For the rule above, every new entry uses the IPv4 address as key in the set, this entry gets a timeout of 120 seconds that gets refresh on every packet seen. If we get new flow and our set already contains 4 entries already, then this packet is dropped. You can already express this in positive logic, assuming default policy to drop: # nft filter input flow table xyz size 4 { ip saddr timeout 10s counter } accept Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8c653bbd1ead..bc0eb6a1066d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -575,6 +575,10 @@ enum nft_dynset_ops { NFT_DYNSET_OP_UPDATE, }; +enum nft_dynset_flags { + NFT_DYNSET_F_INV = (1 << 0), +}; + /** * enum nft_dynset_attributes - dynset expression attributes * @@ -585,6 +589,7 @@ enum nft_dynset_ops { * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32) * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64) * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_DYNSET_FLAGS: flags (NLA_U32) */ enum nft_dynset_attributes { NFTA_DYNSET_UNSPEC, @@ -596,6 +601,7 @@ enum nft_dynset_attributes { NFTA_DYNSET_TIMEOUT, NFTA_DYNSET_EXPR, NFTA_DYNSET_PAD, + NFTA_DYNSET_FLAGS, __NFTA_DYNSET_MAX, }; #define NFTA_DYNSET_MAX (__NFTA_DYNSET_MAX - 1) -- cgit v1.2.3 From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 11 Sep 2016 22:55:53 +0200 Subject: netfilter: conntrack: remove packet hotpath stats These counters sit in hot path and do show up in perf, this is especially true for 'found' and 'searched' which get incremented for every packet processed. Information like searched=212030105 new=623431 found=333613 delete=623327 does not seem too helpful nowadays: - on busy systems found and searched will overflow every few hours (these are 32bit integers), other more busy ones every few days. - for debugging there are better methods, such as iptables' trace target, the conntrack log sysctls. Nowadays we also have perf tool. This removes packet path stat counters except those that are expected to be 0 (or close to 0) on a normal system, e.g. 'insert_failed' (race happened) or 'invalid' (proto tracker rejects). The insert stat is retained for the ctnetlink case. The found stat is retained for the tuple-is-taken check when NAT has to determine if it needs to pick a different source address. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 9df789709abe..6deb8867c5fc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -231,13 +231,13 @@ enum ctattr_secctx { enum ctattr_stats_cpu { CTA_STATS_UNSPEC, - CTA_STATS_SEARCHED, + CTA_STATS_SEARCHED, /* no longer used */ CTA_STATS_FOUND, - CTA_STATS_NEW, + CTA_STATS_NEW, /* no longer used */ CTA_STATS_INVALID, CTA_STATS_IGNORE, - CTA_STATS_DELETE, - CTA_STATS_DELETE_LIST, + CTA_STATS_DELETE, /* no longer used */ + CTA_STATS_DELETE_LIST, /* no longer used */ CTA_STATS_INSERT, CTA_STATS_INSERT_FAILED, CTA_STATS_DROP, -- cgit v1.2.3 From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:37 +0200 Subject: net: core: Add offload stats to if_stats_msg Add a nested attribute of offload stats to if_stats_msg named IFLA_STATS_LINK_OFFLOAD_XSTATS. Under it, add SW stats, meaning stats only per packets that went via slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9bf3aecfe05b..2351776a724f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -826,6 +826,7 @@ enum { IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, __IFLA_STATS_MAX, }; @@ -845,6 +846,14 @@ enum { }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + /* XDP section */ enum { -- cgit v1.2.3 From 4fbae7d83c98c30efcf0a2a2ac55fbb75ef5a1a5 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:19 -0700 Subject: ipvlan: Introduce l3s mode In a typical IPvlan L3 setup where master is in default-ns and each slave is into different (slave) ns. In this setup egress packet processing for traffic originating from slave-ns will hit all NF_HOOKs in slave-ns as well as default-ns. However same is not true for ingress processing. All these NF_HOOKs are hit only in the slave-ns skipping them in the default-ns. IPvlan in L3 mode is restrictive and if admins want to deploy iptables rules in default-ns, this asymmetric data path makes it impossible to do so. This patch makes use of the l3_rcv() (added as part of l3mdev enhancements) to perform input route lookup on RX packets without changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN to change the skb->dev just before handing over skb to L4. Signed-off-by: Mahesh Bandewar CC: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2351776a724f..7ec9e99d5491 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -464,6 +464,7 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, IPVLAN_MODE_MAX }; -- cgit v1.2.3 From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 18 Sep 2016 07:31:43 -0400 Subject: net sched ife action: Introduce skb tcindex metadata encap decap Sample use case of how this is encoded: user space via tuntap (or a connected VM/Machine/container) encodes the tcindex TLV. Sample use case of decoding: IFE action decodes it and the skb->tc_index is then used to classify. So something like this for encoded ICMP packets: .. first decode then reclassify... skb->tcindex will be set sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify ...next match the decode icmp packet... sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue ... last classify it using the tcindex classifier and do someaction.. sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \ handle 0x11 tcindex classid 1:1 \ action blah.. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_ife.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 4ece02a77b9a..cd18360eca24 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -32,8 +32,9 @@ enum { #define IFE_META_HASHID 2 #define IFE_META_PRIO 3 #define IFE_META_QMAP 4 +#define IFE_META_TCINDEX 5 /*Can be overridden at runtime by module option*/ -#define __IFE_META_MAX 5 +#define __IFE_META_MAX 6 #define IFE_META_MAX (__IFE_META_MAX - 1) #endif -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most parts with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. Advantage of direct write is that the writes are inlined whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() also for small sizes, as well as cost of helper call itself with sanity checks are avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f896dfac4ac0..e07432b9f8b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -398,6 +398,27 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_tail, + /** + * bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the + * skb is non-linear and not all of len are part of the + * linear section. Only needed for read/write with direct + * packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_pull_data, + + /** + * bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + */ + BPF_FUNC_csum_update, + __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2016 23:39:11 -0400 Subject: net_sched: sch_fq: add low_rate_threshold parameter This commit adds to the fq module a low_rate_threshold parameter to insert a delay after all packets if the socket requests a pacing rate below the threshold. This helps achieve more precise control of the sending rate with low-rate paths, especially policers. The basic issue is that if a congestion control module detects a policer at a certain rate, it may want fq to be able to shape to that policed rate. That way the sender can avoid policer drops by having the packets arrive at the policer at or just under the policed rate. The default threshold of 550Kbps was chosen analytically so that for policers or links at 500Kbps or 512Kbps fq would very likely invoke this mechanism, even if the pacing rate was briefly slightly above the available bandwidth. This value was then empirically validated with two years of production testing on YouTube video servers. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 2382eed50278..f8e39dbaa781 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -792,6 +792,8 @@ enum { TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + __TCA_FQ_MAX }; -- cgit v1.2.3 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit export two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info). tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 482898fc433a..73ac0db487f8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -167,6 +167,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; + __u8 tcpi_delivery_rate_app_limited:1; __u32 tcpi_rto; __u32 tcpi_ato; @@ -211,6 +212,8 @@ struct tcp_info { __u32 tcpi_min_rtt; __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ + + __u64 tcpi_delivery_rate; }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:23 -0400 Subject: tcp_bbr: add BBR congestion control This commit implements a new TCP congestion control algorithm: BBR (Bottleneck Bandwidth and RTT). A detailed description of BBR will be published in ACM Queue, Vol. 14 No. 5, September-October 2016, as "BBR: Congestion-Based Congestion Control". BBR has significantly increased throughput and reduced latency for connections on Google's internal backbone networks and google.com and YouTube Web servers. BBR requires only changes on the sender side, not in the network or the receiver side. Thus it can be incrementally deployed on today's Internet, or in datacenters. The Internet has predominantly used loss-based congestion control (largely Reno or CUBIC) since the 1980s, relying on packet loss as the signal to slow down. While this worked well for many years, loss-based congestion control is unfortunately out-dated in today's networks. On today's Internet, loss-based congestion control causes the infamous bufferbloat problem, often causing seconds of needless queuing delay, since it fills the bloated buffers in many last-mile links. On today's high-speed long-haul links using commodity switches with shallow buffers, loss-based congestion control has abysmal throughput because it over-reacts to losses caused by transient traffic bursts. In 1981 Kleinrock and Gale showed that the optimal operating point for a network maximizes delivered bandwidth while minimizing delay and loss, not only for single connections but for the network as a whole. Finding that optimal operating point has been elusive, since any single network measurement is ambiguous: network measurements are the result of both bandwidth and propagation delay, and those two cannot be measured simultaneously. While it is impossible to disambiguate any single bandwidth or RTT measurement, a connection's behavior over time tells a clearer story. BBR uses a measurement strategy designed to resolve this ambiguity. It combines these measurements with a robust servo loop using recent control systems advances to implement a distributed congestion control algorithm that reacts to actual congestion, not packet loss or transient queue delay, and is designed to converge with high probability to a point near the optimal operating point. In a nutshell, BBR creates an explicit model of the network pipe by sequentially probing the bottleneck bandwidth and RTT. On the arrival of each ACK, BBR derives the current delivery rate of the last round trip, and feeds it through a windowed max-filter to estimate the bottleneck bandwidth. Conversely it uses a windowed min-filter to estimate the round trip propagation delay. The max-filtered bandwidth and min-filtered RTT estimates form BBR's model of the network pipe. Using its model, BBR sets control parameters to govern sending behavior. The primary control is the pacing rate: BBR applies a gain multiplier to transmit faster or slower than the observed bottleneck bandwidth. The conventional congestion window (cwnd) is now the secondary control; the cwnd is set to a small multiple of the estimated BDP (bandwidth-delay product) in order to allow full utilization and bandwidth probing while bounding the potential amount of queue at the bottleneck. When a BBR connection starts, it enters STARTUP mode and applies a high gain to perform an exponential search to quickly probe the bottleneck bandwidth (doubling its sending rate each round trip, like slow start). However, instead of continuing until it fills up the buffer (i.e. a loss), or until delay or ACK spacing reaches some threshold (like Hystart), it uses its model of the pipe to estimate when that pipe is full: it estimates the pipe is full when it notices the estimated bandwidth has stopped growing. At that point it exits STARTUP and enters DRAIN mode, where it reduces its pacing rate to drain the queue it estimates it has created. Then BBR enters steady state. In steady state, PROBE_BW mode cycles between first pacing faster to probe for more bandwidth, then pacing slower to drain any queue that created if no more bandwidth was available, and then cruising at the estimated bandwidth to utilize the pipe without creating excess queue. Occasionally, on an as-needed basis, it sends significantly slower to probe for RTT (PROBE_RTT mode). BBR has been fully deployed on Google's wide-area backbone networks and we're experimenting with BBR on Google.com and YouTube on a global scale. Replacing CUBIC with BBR has resulted in significant improvements in network latency and application (RPC, browser, and video) metrics. For more details please refer to our upcoming ACM Queue publication. Example performance results, to illustrate the difference between BBR and CUBIC: Resilience to random loss (e.g. from shallow buffers): Consider a netperf TCP_STREAM test lasting 30 secs on an emulated path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher). Low latency with the bloated buffers common in today's last-mile links: Consider a netperf TCP_STREAM test lasting 120 secs on an emulated path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck buffer. Both fully utilize the bottleneck bandwidth, but BBR achieves this with a median RTT 25x lower (43 ms instead of 1.09 secs). Our long-term goal is to improve the congestion control algorithms used on the Internet. We are hopeful that BBR can help advance the efforts toward this goal, and motivate the community to do further research. Test results, performance evaluations, feedback, and BBR-related discussions are very welcome in the public e-mail list for BBR: https://groups.google.com/forum/#!forum/bbr-dev NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, since pacing is integral to the BBR design and implementation. BBR without pacing would not function properly, and may incur unnecessary high packet loss rates. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b5c366f87b3e..509cd961068d 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -124,6 +124,7 @@ enum { INET_DIAG_PEERS, INET_DIAG_PAD, INET_DIAG_MARK, + INET_DIAG_BBRINFO, __INET_DIAG_MAX, }; @@ -157,8 +158,20 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +/* INET_DIAG_BBRINFO */ + +struct tcp_bbr_info { + /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */ + __u32 bbr_bw_lo; /* lower 32 bits of bw */ + __u32 bbr_bw_hi; /* upper 32 bits of bw */ + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ +}; + union tcp_cc_info { struct tcpvegas_info vegas; struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; }; #endif /* _UAPI_INET_DIAG_H_ */ -- cgit v1.2.3 From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:54 +0100 Subject: net: cls_bpf: limit hardware offload by software-only flag Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag. Unlike U32 and flower cls_bpf already has some netlink flags defined. Create a new attribute to be able to use the same flag values as the above. Unlike U32 and flower reject unknown flags. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8915b61bbf83..8fd715f806a2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -396,6 +396,7 @@ enum { TCA_BPF_FD, TCA_BPF_NAME, TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, __TCA_BPF_MAX, }; -- cgit v1.2.3 From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:10 +0300 Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action TCA_VLAN_ACT_MODIFY allows one to change an existing tag. It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id, priority). If packet is vlan tagged, then the tag gets overwritten according to user specified attributes. For example, this allows user to replace a tag's vid while preserving its priority bits (as opposed to "action vlan pop pipe action vlan push"). Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_vlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index be72b6e3843b..bddb272b843f 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,7 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 +#define TCA_VLAN_ACT_MODIFY 3 struct tc_vlan { tc_gen; -- cgit v1.2.3 From 2b03bf732488a3c2e920afe22c03b82cb8477e28 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 13 Sep 2016 13:49:53 +0200 Subject: netfilter: nft_numgen: add number generation offset Add support of an offset value for incremental counter and random. With this option the sysadmin is able to start the counter to a certain value and then apply the generated number. Example: meta mark set numgen inc mod 2 offset 100 This will generate marks with the serie 100, 101, 100, 101, ... Suggested-by: Pablo Neira Ayuso Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index bc0eb6a1066d..bcfb892ff148 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1136,12 +1136,14 @@ enum nft_trace_types { * @NFTA_NG_DREG: destination register (NLA_U32) * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) + * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, NFTA_NG_DREG, NFTA_NG_MODULUS, NFTA_NG_TYPE, + NFTA_NG_OFFSET, __NFTA_NG_MAX }; #define NFTA_NG_MAX (__NFTA_NG_MAX - 1) -- cgit v1.2.3 From 8061bb54436c19fd16b7c734a69ff60bac26e3e9 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 14 Sep 2016 23:41:46 +0800 Subject: netfilter: nft_queue: add _SREG_QNUM attr to select the queue number Currently, the user can specify the queue numbers by _QUEUE_NUM and _QUEUE_TOTAL attributes, this is enough in most situations. But acctually, it is not very flexible, for example: tcp dport 80 mapped to queue0 tcp dport 81 mapped to queue1 tcp dport 82 mapped to queue2 In order to do this thing, we must add 3 nft rules, and more mapping meant more rules ... So take one register to select the queue number, then we can add one simple rule to mapping queues, maybe like this: queue num tcp dport map { 80:0, 81:1, 82:2 ... } Florian Westphal also proposed wider usage scenarios: queue num jhash ip saddr . ip daddr mod ... queue num meta cpu ... queue num meta mark ... The last point is how to load a queue number from sreg, although we can use *(u16*)®s->data[reg] to load the queue number, just like nat expr to load its l4port do. But we will cooperate with hash expr, meta cpu, meta mark expr and so on. They all store the result to u32 type, so cast it to u16 pointer and dereference it will generate wrong result in the big endian system. So just keep it simple, we treat queue number as u32 type, although u16 type is already enough. Suggested-by: Pablo Neira Ayuso Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index bcfb892ff148..1cf41dd838b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -894,12 +894,14 @@ enum nft_log_attributes { * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16) * @NFTA_QUEUE_TOTAL: number of queues to load balance packets on (NLA_U16) * @NFTA_QUEUE_FLAGS: various flags (NLA_U16) + * @NFTA_QUEUE_SREG_QNUM: source register of queue number (NLA_U32: nft_registers) */ enum nft_queue_attributes { NFTA_QUEUE_UNSPEC, NFTA_QUEUE_NUM, NFTA_QUEUE_TOTAL, NFTA_QUEUE_FLAGS, + NFTA_QUEUE_SREG_QNUM, __NFTA_QUEUE_MAX }; #define NFTA_QUEUE_MAX (__NFTA_QUEUE_MAX - 1) -- cgit v1.2.3 From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Sep 2016 08:58:55 -0700 Subject: net_sched: sch_fq: account for schedule/timers drifts It looks like the following patch can make FQ very precise, even in VM or stressed hosts. It matters at high pacing rates. We take into account the difference between the time that was programmed when last packet was sent, and current time (a drift of tens of usecs is often observed) Add an EWMA of the unthrottle latency to help diagnostics. This latency is the difference between current time and oldest packet in delayed RB-tree. This accounts for the high resolution timer latency, but can be different under stress, as fq_check_throttled() can be opportunistically be called from a dequeue() called after an enqueue() for a different flow. Tested: // Start a 10Gbit flow $ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr & Before patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17106.04 756876.84 1102.75 1119049.02 0.00 0.00 0.52 After patch : $ sar -n DEV 10 5 | grep eth0 | grep Average Average: eth0 17867.00 800245.90 1151.77 1183172.12 0.00 0.00 0.52 A new iproute2 tc can output the 'unthrottle latency' : $ tc -s qd sh dev eth0 | grep latency 0 gc, 0 highprio, 32490767 throttled, 2382 ns latency Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f8e39dbaa781..df7451d35131 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -811,7 +811,7 @@ struct tc_fq_qd_stats { __u32 flows; __u32 inactive_flows; __u32 throttled_flows; - __u32 pad; + __u32 unthrottle_latency_ns; }; /* Heavy-Hitter Filter */ -- cgit v1.2.3 From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Sep 2016 01:28:37 +0200 Subject: bpf: add helper to invalidate hash Add a small helper that complements 36bbef52c7eb ("bpf: direct packet write and access for helpers for clsact progs") for invalidating the current skb->hash after mangling on headers via direct packet write. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e07432b9f8b8..f09c70b97eca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -419,6 +419,13 @@ enum bpf_func_id { */ BPF_FUNC_csum_update, + /** + * bpf_set_hash_invalid(skb) + * Invalidate current skb>hash. + * @skb: pointer to skb + */ + BPF_FUNC_set_hash_invalid, + __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7ec9e99d5491..b4fba662cd32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -619,7 +619,7 @@ enum { enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ - IFLA_VF_VLAN, + IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ @@ -631,6 +631,7 @@ enum { IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ __IFLA_VF_MAX, }; @@ -647,6 +648,22 @@ struct ifla_vf_vlan { __u32 qos; }; +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ -- cgit v1.2.3 From 11d5f15723c9f39d7c131d0149d024c17dbef676 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Thu, 22 Sep 2016 12:43:44 -0400 Subject: netfilter: xt_hashlimit: Create revision 2 to support higher pps rates Create a new revision for the hashlimit iptables extension module. Rev 2 will support higher pps of upto 1 million, Version 1 supports only 10k. To support this we have to increase the size of the variables avg and burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2 and xt_hashlimit_mtinfo2 and also create newer versions of all the functions for match, checkentry and destroy. Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very similar in both rev1 and rev2 with only minor changes, so I have split those functions and moved all the common code to a *_common function. Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_hashlimit.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 6db90372f09c..3efc0ca18345 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -6,6 +6,7 @@ /* timings are in milliseconds. */ #define XT_HASHLIMIT_SCALE 10000 +#define XT_HASHLIMIT_SCALE_v2 1000000llu /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 * seconds, or one packet every 59 hours. */ @@ -63,6 +64,20 @@ struct hashlimit_cfg1 { __u8 srcmask, dstmask; }; +struct hashlimit_cfg2 { + __u64 avg; /* Average secs between packets * scale */ + __u64 burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + + /* user specified */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? */ + + __u8 srcmask, dstmask; +}; + struct xt_hashlimit_mtinfo1 { char name[IFNAMSIZ]; struct hashlimit_cfg1 cfg; @@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +struct xt_hashlimit_mtinfo2 { + char name[NAME_MAX]; + struct hashlimit_cfg2 cfg; + + /* Used internally by the kernel */ + struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_HASHLIMIT_H */ -- cgit v1.2.3 From 0f3cd9b3697708c86a825ae3cedabf7be6fd3e72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Sep 2016 15:23:33 +0200 Subject: netfilter: nf_tables: add range expression Inverse ranges != [a,b] are not currently possible because rules are composites of && operations, and we need to express this: data < a || data > b This patch adds a new range expression. Positive ranges can be already through two cmp expressions: cmp(sreg, data, >=) cmp(sreg, data, <=) This new range expression provides an alternative way to express this. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 1cf41dd838b2..c6c4477c136b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -546,6 +546,35 @@ enum nft_cmp_attributes { }; #define NFTA_CMP_MAX (__NFTA_CMP_MAX - 1) +/** + * enum nft_range_ops - nf_tables range operator + * + * @NFT_RANGE_EQ: equal + * @NFT_RANGE_NEQ: not equal + */ +enum nft_range_ops { + NFT_RANGE_EQ, + NFT_RANGE_NEQ, +}; + +/** + * enum nft_range_attributes - nf_tables range expression netlink attributes + * + * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers) + * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops) + * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes) + * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes) + */ +enum nft_range_attributes { + NFTA_RANGE_UNSPEC, + NFTA_RANGE_SREG, + NFTA_RANGE_OP, + NFTA_RANGE_FROM_DATA, + NFTA_RANGE_TO_DATA, + __NFTA_RANGE_MAX +}; +#define NFTA_RANGE_MAX (__NFTA_RANGE_MAX - 1) + enum nft_lookup_flags { NFT_LOOKUP_F_INV = (1 << 0), }; -- cgit v1.2.3 From ff107d27761ff4b644c82c209e004ec9c8fbbc22 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 25 Sep 2016 16:35:56 +0800 Subject: netfilter: nft_log: complete NFTA_LOG_FLAGS attr support NFTA_LOG_FLAGS attribute is already supported, but the related NF_LOG_XXX flags are not exposed to the userspace. So we cannot explicitly enable log flags to log uid, tcp sequence, ip options and so on, i.e. such rule "nft add rule filter output log uid" is not supported yet. So move NF_LOG_XXX macro definitions to the uapi/../nf_log.h. In order to keep consistent with other modules, change NF_LOG_MASK to refer to all supported log flags. On the other hand, add a new NF_LOG_DEFAULT_MASK to refer to the original default log flags. Finally, if user specify the unsupported log flags or NFTA_LOG_GROUP and NFTA_LOG_FLAGS are set at the same time, report EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_log.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/uapi/linux/netfilter/nf_log.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_log.h b/include/uapi/linux/netfilter/nf_log.h new file mode 100644 index 000000000000..8be21e02387d --- /dev/null +++ b/include/uapi/linux/netfilter/nf_log.h @@ -0,0 +1,12 @@ +#ifndef _NETFILTER_NF_LOG_H +#define _NETFILTER_NF_LOG_H + +#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ +#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ +#define NF_LOG_IPOPT 0x04 /* Log IP options */ +#define NF_LOG_UID 0x08 /* Log UID owning local socket */ +#define NF_LOG_NFLOG 0x10 /* Unsupported, don't reuse */ +#define NF_LOG_MACDECODE 0x20 /* Decode MAC header */ +#define NF_LOG_MASK 0x2f + +#endif /* _NETFILTER_NF_LOG_H */ -- cgit v1.2.3 From bd11f0741fa5a2c296629898ad07759dd12b35bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (inline with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S. Miller --- include/uapi/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 395876060f50..8c2772340c3f 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -177,6 +177,7 @@ enum { DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_DROP_UNSOLICITED_NA, DEVCONF_KEEP_ADDR_ON_DOWN, + DEVCONF_RTR_SOLICIT_MAX_INTERVAL, DEVCONF_MAX }; -- cgit v1.2.3