diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/devinet.c | 86 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 15 | ||||
-rw-r--r-- | net/ipv4/fou.c | 9 | ||||
-rw-r--r-- | net/ipv4/gre_demux.c | 17 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 68 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 293 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 289 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 142 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 3 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 70 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 50 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 2 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 61 | ||||
-rw-r--r-- | net/ipv4/netfilter.c | 18 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 72 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 21 | ||||
-rw-r--r-- | net/ipv4/tcp_bbr.c | 180 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 4 | ||||
-rw-r--r-- | net/ipv4/udp.c | 18 | ||||
-rw-r--r-- | net/ipv4/udp_impl.h | 1 | ||||
-rw-r--r-- | net/ipv4/udplite.c | 1 |
27 files changed, 906 insertions, 550 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e258a00b4a3d..cd9033245b98 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2063,13 +2063,49 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; +static int inet_netconf_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETCONFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETCONFA_IFINDEX: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; - struct netconfmsg *ncm; struct sk_buff *skb; struct ipv4_devconf *devconf; struct in_device *in_dev; @@ -2077,9 +2113,8 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, int ifindex; int err; - err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv4_policy, extack); - if (err < 0) + err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); + if (err) goto errout; err = -EINVAL; @@ -2556,32 +2591,32 @@ static __net_init int devinet_init_net(struct net *net) int err; struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table *tbl; struct ctl_table_header *forw_hdr; #endif err = -ENOMEM; - all = &ipv4_devconf; - dflt = &ipv4_devconf_dflt; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); + if (!all) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); - if (!all) - goto err_alloc_all; - - dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); - if (!dflt) - goto err_alloc_dflt; + dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; #ifdef CONFIG_SYSCTL - tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); - if (!tbl) - goto err_alloc_ctl; + tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); + if (!tbl) + goto err_alloc_ctl; - tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; - tbl[0].extra1 = all; - tbl[0].extra2 = net; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; #endif + + if (sysctl_devconf_inherit_init_net != 2 && !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); } #ifdef CONFIG_SYSCTL @@ -2611,15 +2646,12 @@ err_reg_ctl: err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: - if (tbl != ctl_forward_entry) - kfree(tbl); + kfree(tbl); err_alloc_ctl: #endif - if (dflt != &ipv4_devconf_dflt) - kfree(dflt); + kfree(dflt); err_alloc_dflt: - if (all != &ipv4_devconf) - kfree(all); + kfree(all); err_alloc_all: return err; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6df95be96311..fe4f6a624238 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -203,7 +203,7 @@ static void fib_flush(struct net *net) struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) - flushed += fib_table_flush(net, tb); + flushed += fib_table_flush(net, tb, false); } if (flushed) @@ -1463,7 +1463,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); - fib_table_flush(net, tb); + fib_table_flush(net, tb, true); fib_free_table(tb); } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 237c9f72b265..a573e37e0615 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1856,7 +1856,7 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. */ -int fib_table_flush(struct net *net, struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; @@ -1904,8 +1904,17 @@ int fib_table_flush(struct net *net, struct fib_table *tb) hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; - if (!fi || !(fi->fib_flags & RTNH_F_DEAD) || - tb->tb_id != fa->tb_id) { + if (!fi || tb->tb_id != fa->tb_id || + (!(fi->fib_flags & RTNH_F_DEAD) && + !fib_props[fa->fa_type].error)) { + slen = fa->fa_slen; + continue; + } + + /* Do not flush error routes if network namespace is + * not being dismantled + */ + if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 632863541082..437070d1ffb1 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1020,10 +1020,11 @@ static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; - size_t optlen; + size_t len, optlen; int ret; - if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -1058,6 +1059,10 @@ static int gue_err(struct sk_buff *skb, u32 info) optlen = guehdr->hlen << 2; + if (!pskb_may_pull(skb, len + optlen)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index a4bf22ee3aed..7c4a41dc04bb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> +#include <net/erspan.h> #include <net/icmp.h> #include <net/route.h> @@ -119,6 +120,22 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, hdr_len += 4; } tpi->hdr_len = hdr_len; + + /* ERSPAN ver 1 and 2 protocol sets GRE key field + * to 0 and sets the configured key in the + * inner erspan header field + */ + if (greh->protocol == htons(ETH_P_ERSPAN) || + greh->protocol == htons(ETH_P_ERSPAN2)) { + struct erspan_base_hdr *ershdr; + + if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) + return -EINVAL; + + ershdr = (struct erspan_base_hdr *)options; + tpi->key = cpu_to_be32(get_session_id(ershdr)); + } + return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 765b2b32c4a4..a40e48ded10d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1493,22 +1493,22 @@ static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) len += sizeof(struct igmpv3_report); - return pskb_may_pull(skb, len) ? 0 : -EINVAL; + return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { - unsigned int len = skb_transport_offset(skb); - - len += sizeof(struct igmphdr); - if (skb->len < len) - return -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + unsigned int len; /* IGMPv{1,2}? */ - if (skb->len != len) { + if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ - len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); - if (skb->len < len || !pskb_may_pull(skb, len)) + if (transport_len < sizeof(struct igmpv3_query)) + return -EINVAL; + + len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); + if (!ip_mc_may_pull(skb, len)) return -EINVAL; } @@ -1544,47 +1544,29 @@ static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_simple_validate(skb); } -static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) - +static int ip_mc_check_igmp_csum(struct sk_buff *skb) { - struct sk_buff *skb_chk; - unsigned int transport_len; unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); - int ret = -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + struct sk_buff *skb_chk; - transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + if (!ip_mc_may_pull(skb, len)) + return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) - goto err; - - if (!pskb_may_pull(skb_chk, len)) - goto err; - - ret = ip_mc_check_igmp_msg(skb_chk); - if (ret) - goto err; - - if (skb_trimmed) - *skb_trimmed = skb_chk; - /* free now unneeded clone */ - else if (skb_chk != skb) - kfree_skb(skb_chk); - - ret = 0; + return -EINVAL; -err: - if (ret && skb_chk && skb_chk != skb) + if (skb_chk != skb) kfree_skb(skb_chk); - return ret; + return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate - * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. @@ -1594,18 +1576,10 @@ err: * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * - * Optionally, an skb pointer might be provided via skb_trimmed (or set it - * to NULL): After parsing an IGMP packet successfully it will point to - * an skb which has its tail aligned to the IP packet end. This might - * either be the originally provided skb or a trimmed, cloned version if - * the skb frame had data beyond the IP packet. A cloned skb allows us - * to leave the original skb and its full frame unchanged (which might be - * desirable for layer 2 frame jugglers). - * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ -int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); @@ -1615,7 +1589,11 @@ int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; - return __ip_mc_check_igmp(skb, skb_trimmed); + ret = ip_mc_check_igmp_csum(skb); + if (ret < 0) + return ret; + + return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 760a9e52e02b..9f69411251d0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,62 @@ #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. @@ -123,6 +179,28 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -224,3 +302,218 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return fq; } EXPORT_SYMBOL(inet_frag_find); + +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) +{ + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). + */ + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (last->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < last->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == last->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = curr->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= curr->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= curr->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + } + + skb->ip_defrag_offset = offset; + + return IPFRAG_OK; +} +EXPORT_SYMBOL(inet_frag_queue_insert); + +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) +{ + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + struct sk_buff **nextp; + int delta; + + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) + return NULL; + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; + } + WARN_ON(head->ip_defrag_offset != 0); + + delta = -head->truesize; + + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + return NULL; + + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->net, delta); + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. + */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + return NULL; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; + } + + return nextp; +} +EXPORT_SYMBOL(inet_frag_reasm_prepare); + +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data) +{ + struct sk_buff **nextp = (struct sk_buff **)reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; + } + } + sub_frag_mem_limit(q->net, head->truesize); + + *nextp = NULL; + skb_mark_not_on_list(head); + head->prev = NULL; + head->tstamp = q->stamp; +} +EXPORT_SYMBOL(inet_frag_reasm_finish); + +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) +{ + struct sk_buff *head; + + if (q->fragments) { + head = q->fragments; + q->fragments = head->next; + } else { + struct sk_buff *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->net, head->truesize); + + return head; +} +EXPORT_SYMBOL(inet_frag_pull_head); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 867be8f7f1fa..486ecb0aeb87 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,57 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -/* Use skb->cb to track consecutive/adjacent fragments coming at - * the end of the queue. Nodes in the rb-tree queue will - * contain "runs" of one or more adjacent fragments. - * - * Invariants: - * - next_frag is NULL at the tail of a "run"; - * - the head of a "run" has the sum of all fragment lengths in frag_run_len. - */ -struct ipfrag_skb_cb { - struct inet_skb_parm h; - struct sk_buff *next_frag; - int frag_run_len; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - -static void ip4_frag_init_run(struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); - - FRAG_CB(skb)->next_frag = NULL; - FRAG_CB(skb)->frag_run_len = skb->len; -} - -/* Append skb to the last "run". */ -static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, - struct sk_buff *skb) -{ - RB_CLEAR_NODE(&skb->rbnode); - FRAG_CB(skb)->next_frag = NULL; - - FRAG_CB(q->last_run_head)->frag_run_len += skb->len; - FRAG_CB(q->fragments_tail)->next_frag = skb; - q->fragments_tail = skb; -} - -/* Create a new "run" with the skb. */ -static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) -{ - if (q->last_run_head) - rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, - &q->last_run_head->rbnode.rb_right); - else - rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); - rb_insert_color(&skb->rbnode, &q->rb_fragments); - - ip4_frag_init_run(skb); - q->fragments_tail = skb; - q->last_run_head = skb; -} - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t) * pull the head out of the tree in order to be able to * deal with head->dev. */ - if (qp->q.fragments) { - head = qp->q.fragments; - qp->q.fragments = head->next; - } else { - head = skb_rb_first(&qp->q.rb_fragments); - if (!head) - goto out; - if (FRAG_CB(head)->next_frag) - rb_replace_node(&head->rbnode, - &FRAG_CB(head)->next_frag->rbnode, - &qp->q.rb_fragments); - else - rb_erase(&head->rbnode, &qp->q.rb_fragments); - memset(&head->rbnode, 0, sizeof(head->rbnode)); - barrier(); - } - if (head == qp->q.fragments_tail) - qp->q.fragments_tail = NULL; - - sub_frag_mem_limit(qp->q.net, head->truesize); - + head = inet_frag_pull_head(&qp->q); + if (!head) + goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -344,12 +275,10 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; - struct sk_buff *skb1, *prev_tail; - int ihl, end, skb1_run_end; + int ihl, end, flags, offset; + struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; - int flags, offset; int err = -ENOENT; u8 ecn; @@ -413,62 +342,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Makes sure compiler wont do silly aliasing games */ barrier(); - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * - * We do the same here for IPv4 (and increment an snmp counter) but - * we do not want to drop the whole queue in response to a duplicate - * fragment. - */ - - err = -EINVAL; - /* Find out where to put this fragment. */ prev_tail = qp->q.fragments_tail; - if (!prev_tail) - ip4_frag_create_run(&qp->q, skb); /* First fragment. */ - else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { - /* This is the common case: skb goes to the end. */ - /* Detect and discard overlaps. */ - if (offset < prev_tail->ip_defrag_offset + prev_tail->len) - goto overlap; - if (offset == prev_tail->ip_defrag_offset + prev_tail->len) - ip4_frag_append_to_last_run(&qp->q, skb); - else - ip4_frag_create_run(&qp->q, skb); - } else { - /* Binary search. Note that skb can become the first fragment, - * but not the last (covered above). - */ - rbn = &qp->q.rb_fragments.rb_node; - do { - parent = *rbn; - skb1 = rb_to_skb(parent); - skb1_run_end = skb1->ip_defrag_offset + - FRAG_CB(skb1)->frag_run_len; - if (end <= skb1->ip_defrag_offset) - rbn = &parent->rb_left; - else if (offset >= skb1_run_end) - rbn = &parent->rb_right; - else if (offset >= skb1->ip_defrag_offset && - end <= skb1_run_end) - goto err; /* No new data, potential duplicate */ - else - goto overlap; /* Found an overlap */ - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. - */ - ip4_frag_init_run(skb); - rb_link_node(&skb->rbnode, parent, rbn); - rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - } + err = inet_frag_queue_insert(&qp->q, skb, offset, end); + if (err) + goto insert_error; if (dev) qp->iif = dev->ifindex; - skb->ip_defrag_offset = offset; qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; @@ -501,10 +381,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) skb_dst_drop(skb); return -EINPROGRESS; -overlap: +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb(skb); return err; @@ -516,13 +402,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); - struct sk_buff **nextp; /* To build frag_list. */ - struct rb_node *rbn; - int len; - int ihlen; - int delta; - int err; + void *reasm_data; + int len, err; u8 ecn; ipq_kill(qp); @@ -532,117 +413,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, err = -EINVAL; goto out_fail; } - /* Make the one we just received the head. */ - if (head != skb) { - fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - goto out_nomem; - FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; - if (RB_EMPTY_NODE(&skb->rbnode)) - FRAG_CB(prev_tail)->next_frag = fp; - else - rb_replace_node(&skb->rbnode, &fp->rbnode, - &qp->q.rb_fragments); - if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; - skb_morph(skb, head); - FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; - rb_replace_node(&head->rbnode, &skb->rbnode, - &qp->q.rb_fragments); - consume_skb(head); - head = skb; - } - WARN_ON(head->ip_defrag_offset != 0); - - /* Allocate a new buffer for the datagram. */ - ihlen = ip_hdrlen(head); - len = ihlen + qp->q.len; + /* Make the one we just received the head. */ + reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); + if (!reasm_data) + goto out_nomem; + len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_nomem; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(qp->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); - skb_shinfo(head)->frag_list = clone; - nextp = &clone->next; - } else { - nextp = &skb_shinfo(head)->frag_list; - } - - skb_push(head, head->data - skb_network_header(head)); + inet_frag_reasm_finish(&qp->q, skb, reasm_data); - /* Traverse the tree in order, to build frag_list. */ - fp = FRAG_CB(head)->next_frag; - rbn = rb_next(&head->rbnode); - rb_erase(&head->rbnode, &qp->q.rb_fragments); - while (rbn || fp) { - /* fp points to the next sk_buff in the current run; - * rbn points to the next run. - */ - /* Go through the current run. */ - while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; - } - /* Move to the next run. */ - if (rbn) { - struct rb_node *rbnext = rb_next(rbn); - - fp = rb_to_skb(rbn); - rb_erase(rbn, &qp->q.rb_fragments); - rbn = rbnext; - } - } - sub_frag_mem_limit(qp->q.net, head->truesize); - - *nextp = NULL; - skb_mark_not_on_list(head); - head->prev = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); + skb->dev = dev; + IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - iph = ip_hdr(head); + iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; @@ -655,7 +442,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { - IPCB(head)->flags |= IPSKB_FRAG_PMTU; + IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; @@ -753,28 +540,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); -unsigned int inet_frag_rbtree_purge(struct rb_root *root) -{ - struct rb_node *p = rb_first(root); - unsigned int sum = 0; - - while (p) { - struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - - p = rb_next(p); - rb_erase(&skb->rbnode, root); - while (skb) { - struct sk_buff *next = FRAG_CB(skb)->next_frag; - - sum += skb->truesize; - kfree_skb(skb); - skb = next; - } - } - return sum; -} -EXPORT_SYMBOL(inet_frag_rbtree_purge); - #ifdef CONFIG_SYSCTL static int dist_min; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d1d09f3e5f9e..d1cef66820d3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -268,20 +268,11 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - len = gre_hdr_len + sizeof(*ershdr); - - /* Check based hdr len */ - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; - /* The original GRE header does not have key field, - * Use ERSPAN 10-bit session ID as key. - */ - tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); @@ -458,81 +449,14 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } -static struct rtable *gre_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key) -{ - struct net *net = dev_net(dev); - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_GRE; - - return ip_route_output_key(net, fl); -} - -static struct rtable *prepare_fb_xmit(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - int tunnel_hlen) -{ - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - int min_headroom; - bool use_cache; - int err; - - tun_info = skb_tunnel_info(skb); - key = &tun_info->key; - use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); - - if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); - if (!rt) { - rt = gre_get_rt(skb, dev, fl, key); - if (IS_ERR(rt)) - goto err_free_skb; - if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl->saddr); - } - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - return rt; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NULL; -} - static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - struct flowi4 fl; int tunnel_hlen; - __be16 df, flags; + __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -542,13 +466,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) - goto err_free_rt; + goto err_free_skb; flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); @@ -556,32 +479,25 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; } -static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; - struct rtable *rt = NULL; bool truncate = false; - struct flowi4 fl; + __be16 proto; int tunnel_hlen; int version; - __be16 df; int nhoff; int thoff; @@ -592,21 +508,20 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) - goto err_free_rt; + goto err_free_skb; md = ip_tunnel_info_opts(tun_info); if (!md) - goto err_free_rt; + goto err_free_skb; /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; if (gre_handle_offloads(skb, false)) - goto err_free_rt; + goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { pskb_trim(skb, dev->mtu + dev->hard_header_len); @@ -626,27 +541,25 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); + proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); + proto = htons(ETH_P_ERSPAN2); } else { - goto err_free_rt; + goto err_free_skb; } gre_build_header(skb, 8, TUNNEL_SEQ, - htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(tunnel->o_seqno++)); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; @@ -655,13 +568,18 @@ err_free_skb: static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; - rt = gre_get_rt(skb, dev, &fl4, &info->key); + key = &info->key; + ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), key->tos, 0, + skb->mark); + rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -721,12 +639,13 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { - erspan_fb_xmit(skb, dev, skb->protocol); + erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } @@ -742,19 +661,22 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) + if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); - else if (tunnel->erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto free_skb; + } tunnel->parms.o_flags &= ~TUNNEL_KEY; - __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 26921f6b3b92..cd6b5694f99e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -429,7 +429,6 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -488,6 +487,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ @@ -520,6 +520,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..893f013d5369 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -501,15 +501,19 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, - const struct iphdr *inner_iph) + const struct iphdr *inner_iph, + int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); - int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; + int pkt_size; int mtu; + tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; + pkt_size = skb->len - tunnel_hlen - dev->hard_header_len; + if (df) mtu = dst_mtu(&rt->dst) - dev->hard_header_len - - sizeof(struct iphdr) - tunnel->hlen; + - sizeof(struct iphdr) - tunnel_hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -527,11 +531,13 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + __be32 daddr; + + daddr = md ? dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); @@ -548,17 +554,19 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } -void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; - struct rtable *rt; + struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; + bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -574,20 +582,39 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), RT_TOS(tos), + 0, skb->mark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; - rt = ip_route_output_key(tunnel->net, &fl4); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } + + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, + key->u.ipv4.dst, true)) { + ip_rt_put(rt); + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { @@ -598,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else ttl = ip4_dst_hoplimit(&rt->dst); } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - else if (skb->protocol == htons(ETH_P_IP)) + + if (!df && skb->protocol == htons(ETH_P_IP)) df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (headroom > dev->needed_headroom) dev->needed_headroom = headroom; @@ -644,13 +671,19 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ + struct ip_tunnel_info *tun_info; if (!skb_dst(skb)) { dev->stats.tx_fifo_errors++; goto tx_error; } - if (skb->protocol == htons(ETH_P_IP)) { + tun_info = skb_tunnel_info(skb); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && + ip_tunnel_info_af(tun_info) == AF_INET && + tun_info->key.u.ipv4.dst) + dst = tun_info->key.u.ipv4.dst; + else if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } @@ -731,7 +764,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, + 0, 0, false)) { ip_rt_put(rt); goto tx_error; } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index d7b43e700023..68a21bf75dd0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -74,6 +74,33 @@ drop: return 0; } +static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + + skb->dev = tunnel->dev; + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + static int vti_rcv(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; @@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb) return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); } +static int vti_rcv_ipip(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +} + static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = vti_rcv_ipip, + .err_handler = vti4_err, + .priority = 0, +}; + static int __net_init vti_init_net(struct net *net) { int err; @@ -603,6 +644,13 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; + msg = "ipip tunnel"; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: cant't register tunnel\n",__func__); + goto xfrm_tunnel_failed; + } + msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) @@ -612,6 +660,8 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +xfrm_tunnel_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 57c5dd283a2c..fe10b9a2efc8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -302,7 +302,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) - ip_md_tunnel_xmit(skb, dev, ipproto); + ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ddbf8c9a1abb..fb99002c3d4e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2467,6 +2467,61 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); } +static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_SRC: + case RTA_DST: + case RTA_TABLE: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request"); + return -EINVAL; + } + } + + return 0; +} + static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2475,18 +2530,14 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; - struct rtmsg *rtm; __be32 src, grp; u32 tableid; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; - rtm = nlmsg_data(nlh); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 8d2e5dc9a827..a058213b77a7 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,24 +80,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) -{ - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); - } - return 0; -} -EXPORT_SYMBOL_GPL(nf_ip_reroute); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b61977db9b7f..2a909e5f9ba0 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS struct clusterip_net *cn = clusterip_pernet(net); -#ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 2687db015b6f..e26165af45cb 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -214,7 +214,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, } /* Change outer to look like the reply to an incoming packet */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce92f73cf104..99be68b15da0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2763,6 +2763,75 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, return skb; } +static int inet_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, + "ipv4: Invalid header for route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); + return -EINVAL; + } + + if (rtm->rtm_flags & ~(RTM_F_NOTIFY | + RTM_F_LOOKUP_TABLE | + RTM_F_FIB_MATCH)) { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_IIF: + case RTA_OIF: + case RTA_SRC: + case RTA_DST: + case RTA_IP_PROTO: + case RTA_SPORT: + case RTA_DPORT: + case RTA_MARK: + case RTA_UID: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2783,8 +2852,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err; int mark; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, - extack); + err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541bdb9f81d7..6f8d292ad501 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1127,7 +1127,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, - int *copied, size_t size) + int *copied, size_t size, + struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -1147,6 +1148,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; + tp->fastopen_req->uarg = uarg; if (inet->defer_connect) { err = tcp_connect(sk); @@ -1186,11 +1188,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { - if (sk->sk_state != TCP_ESTABLISHED) { - err = -EINVAL; - goto out_err; - } - skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { @@ -1205,7 +1202,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && !tp->repair) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) @@ -1554,7 +1551,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && - !icsk->icsk_ack.pingpong)) && + !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } @@ -2987,16 +2984,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_QUICKACK: if (!val) { - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } else { - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } } break; @@ -3410,7 +3407,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; } case TCP_QUICKACK: - val = !icsk->icsk_ack.pingpong; + val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0f497fc49c3f..56be7d27f208 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -115,6 +115,14 @@ struct bbr { unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + unused_c:6; }; #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ @@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48; +/* Gain factor for adding extra_acked to target cwnd: */ +static const int bbr_extra_acked_gain = BBR_UNIT; +/* Window length of extra_acked window. */ +static const u32 bbr_extra_acked_win_rtts = 5; +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; +/* Time period for clamping cwnd increment due to ack aggregation */ +static const u32 bbr_extra_acked_max_us = 100 * 1000; + static void bbr_check_probe_rtt_done(struct sock *sk); /* Do we estimate that STARTUP filled the pipe? */ @@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk) return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); } +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + /* Return rate in bytes per second, optionally with a gain. * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. @@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) if (event == CA_EVENT_TX_START && tp->app_limited) { bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; /* Avoid pointless buffer overflows: pace at est. bw if we don't * need more speed (we're restarting from idle and app-limited). */ @@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } } -/* Find target cwnd. Right-size the cwnd based on min RTT and the - * estimated bottleneck bandwidth: +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * cwnd = bw * min_rtt * gain = BDP * gain + * bdp = bw * min_rtt * gain * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT * measurements (e.g., delayed ACKs or other ACK compression effects). This * noise may cause BBR to under-estimate the rate. - * - * To achieve full performance in high-speed paths, we budget enough cwnd to - * fit full-sized skbs in-flight on both end hosts to fully utilize the path: - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) { struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd; + u32 bdp; u64 w; /* If we've never had a valid RTT sample, cap cwnd at the initial @@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; /* Apply a gain to the given value, then remove the BW_SCALE shift. */ - cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); /* Allow enough full-sized skbs in flight to utilize end systems. */ cwnd += 3 * bbr_tso_segs_goal(sk); @@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight, gain); + + return inflight; +} + /* With pacing at lower layers, there's often less data "in the network" than * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), * we often have several skbs queued in the pacing layer with a pre-scheduled @@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) return inflight_at_edt - interval_delivered; } +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done; + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ - target_cwnd = bbr_target_cwnd(sk, bw, gain); if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain > BBR_UNIT) return is_full_length && (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); /* A pacing_gain < 1.0 tries to drain extra queue we added if bw * probing didn't find more bw. If inflight falls to match BDP then we * estimate queue is drained; persisting would underutilize the pipe. */ return is_full_length || - inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); + inflight <= bbr_inflight(sk, bw, BBR_UNIT); } static void bbr_advance_cycle_phase(struct sock *sk) @@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) } } +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + /* Estimate when the pipe is full, using the change in delivery rate: BBR * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited @@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ tcp_sk(sk)->snd_ssthresh = - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk) static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); + bbr_update_ack_aggregation(sk, rs); bbr_update_cycle_phase(sk, rs); bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); @@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk) bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 76858b14ebe9..7a027dec649b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -221,7 +221,7 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } EXPORT_SYMBOL(tcp_enter_quickack_mode); @@ -236,7 +236,7 @@ static bool tcp_in_quickack_mode(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); return (dst && dst_metric(dst, RTAX_QUICKACK)) || - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -4094,7 +4094,7 @@ void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); break; case TCP_CLOSE_WAIT: @@ -5889,7 +5889,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || - icsk->icsk_ack.pingpong) { + inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index efc6fef692ff..662b034f1795 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2437,7 +2437,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), tp->snd_cwnd, state == TCP_LISTEN ? fastopenq->max_qlen : diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6527f61f59ff..96bdb8eae9bb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -165,13 +165,16 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - tp->lsndtime = now; - - /* If it is a reply for ato after last received - * packet, enter pingpong mode. + /* If this is the first data packet sent in response to the + * previous received data, + * and it is a reply for ato after last received packet, + * increase pingpong count. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && + (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_inc_pingpong_cnt(sk); + + tp->lsndtime = now; } /* Account for an ACK we sent. */ @@ -3455,6 +3458,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) skb_trim(syn_data, copied); space = copied; } + skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) @@ -3568,7 +3572,7 @@ void tcp_send_delayed_ack(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || + if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d7399a89469d..f0c86398e6a7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -277,14 +277,14 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { - if (!icsk->icsk_ack.pingpong) { + if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_mstamp_refresh(tcp_sk(sk)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3fb0ed5e4789..5c3cd5d84a6f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -847,15 +847,23 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) + if (hlen + cork->gso_size > cork->fragsize) { + kfree_skb(skb); return -EINVAL; - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + } + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + kfree_skb(skb); return -EINVAL; - if (sk->sk_no_check_tx) + } + if (sk->sk_no_check_tx) { + kfree_skb(skb); return -EINVAL; + } if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) + dst_xfrm(skb_dst(skb))) { + kfree_skb(skb); return -EIO; + } skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; @@ -1918,7 +1926,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) } EXPORT_SYMBOL(udp_lib_rehash); -static void udp_v4_rehash(struct sock *sk) +void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 322672655419..6b2fa77eeb1c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -10,6 +10,7 @@ int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); +void udp_v4_rehash(struct sock *sk); int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 39c7f17d916f..3c94b8f0ff27 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -53,6 +53,7 @@ struct proto udplite_prot = { .sendpage = udp_sendpage, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, |