diff options
Diffstat (limited to 'net/ipv4')
37 files changed, 435 insertions, 302 deletions
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 4dd95cdd8070..c01fa791260d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -461,9 +461,9 @@ static int ah4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); + ipv4_redirect(skb, net, 0, IPPROTO_AH); xfrm_state_put(x); return 0; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 82178cc69c96..777fa3b7fb13 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer - * to the start of the CIPSO option on success, NULL if one if not found. + * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) @@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int optlen; int taglen; - for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { + for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { - case IPOPT_CIPSO: - return optptr; case IPOPT_END: return NULL; case IPOPT_NOOP: @@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) default: taglen = optptr[1]; } + if (!taglen || taglen > optlen) + return NULL; + if (optptr[0] == IPOPT_CIPSO) + return optptr; + optlen -= taglen; optptr += taglen; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 44d931a3cd50..d122ebbe5980 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -782,7 +782,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, } static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, - __u32 *pvalid_lft, __u32 *pprefered_lft) + __u32 *pvalid_lft, __u32 *pprefered_lft, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFA_MAX+1]; struct in_ifaddr *ifa; @@ -792,7 +793,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -897,7 +898,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); + ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -1659,17 +1660,70 @@ nla_put_failure: return -EMSGSIZE; } +static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, + struct inet_fill_args *fillargs, + struct net **tgt_net, struct sock *sk, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFA_MAX+1]; + struct ifaddrmsg *ifm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); + return -EINVAL; + } + if (ifm->ifa_index) { + NL_SET_ERR_MSG(extack, "ipv4: Filter by device index not supported for address dump"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= IFA_MAX; ++i) { + if (!tb[i]) + continue; + + if (i == IFA_TARGET_NETNSID) { + struct net *net; + + fillargs->netnsid = nla_get_s32(tb[i]); + + net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); + if (IS_ERR(net)) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); + return PTR_ERR(net); + } + *tgt_net = net; + } else { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, - .seq = cb->nlh->nlmsg_seq, + .seq = nlh->nlmsg_seq, .event = RTM_NEWADDR, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; struct net *tgt_net = net; int h, s_h; int idx, s_idx; @@ -1683,16 +1737,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; - if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, - ifa_ipv4_policy, NULL) >= 0) { - if (tb[IFA_TARGET_NETNSID]) { - fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + if (cb->strict_check) { + int err; - tgt_net = rtnl_get_net_ns_capable(skb->sk, - fillargs.netnsid); - if (IS_ERR(tgt_net)) - return PTR_ERR(tgt_net); - } + err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, + skb->sk, cb->extack); + if (err < 0) + return err; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { @@ -2035,6 +2086,7 @@ errout: static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int h, s_h; int idx, s_idx; @@ -2042,6 +2094,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, struct in_device *in_dev; struct hlist_head *head; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -2061,7 +2128,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, if (inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -2078,7 +2145,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -2089,7 +2156,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 97689012b357..9e1c840596c5 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -683,12 +683,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err) */ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ip_esp_hdr *esph; struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int ivlen = crypto_aead_ivsize(aead); - int elen = skb->len - sizeof(*esph) - ivlen; + int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen; int nfrags; int assoclen; int seqhilen; @@ -698,13 +697,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct scatterlist *sg; int err = -EINVAL; - if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) goto out; if (elen <= 0) goto out; - assoclen = sizeof(*esph); + assoclen = sizeof(struct ip_esp_hdr); seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { @@ -820,9 +819,9 @@ static int esp4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_ESP); xfrm_state_put(x); return 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2998b0e47d4b..038f511c73fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) +{ + bool dev_match = false; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int ret; + + for (ret = 0; ret < fi->fib_nhs; ret++) { + struct fib_nh *nh = &fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; + } + } +#else + if (fi->fib_nh[0].nh_dev == dev) + dev_match = true; +#endif + + return dev_match; +} +EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); - dev_match = false; - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - if (nh->nh_dev == dev) { - dev_match = true; - break; - } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif + dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; return ret; @@ -792,8 +802,40 @@ errout: return err; } +int ip_valid_fib_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); + return -EINVAL; + } + + rtm = nlmsg_data(nlh); + if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || + rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || + rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); + return -EINVAL; + } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { + NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request"); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); + static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; @@ -801,8 +843,14 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; int dumped = 0, err; - if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && - ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) + if (cb->strict_check) { + err = ip_valid_fib_dump_req(nlh, cb->extack); + if (err < 0) + return err; + } + + if (nlmsg_len(nlh) >= sizeof(struct rtmsg) && + ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED) return skb->len; s_h = cb->args[0]; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index bee8db979195..f8c7ec8171a8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); - struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - m = fi->fib_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); + ip_fib_metrics_put(fi->fib_metrics); + kfree(fi); } @@ -1020,13 +1018,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } -static int -fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) -{ - return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, - fi->fib_metrics->metrics); -} - struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { @@ -1084,16 +1075,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (unlikely(!fi->fib_metrics)) { - kfree(fi); - return ERR_PTR(err); - } - refcount_set(&fi->fib_metrics->refcnt, 1); - } else { - fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; + fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, + cfg->fc_mx_len); + if (unlikely(IS_ERR(fi->fib_metrics))) { + err = PTR_ERR(fi->fib_metrics); + kfree(fi); + return ERR_PTR(err); } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1112,10 +1101,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; } endfor_nexthops(fi) - err = fib_convert_metrics(fi, cfg); - if (err) - goto failure; - if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 695979b7ef6d..d832beed6e3a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1098,9 +1098,9 @@ void icmp_err(struct sk_buff *skb, u32 info) } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) - ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_ICMP); } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dfd5009f96ef..15e7f7915a21 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = ireq_opt_deref(ireq); + rcu_read_lock(); + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, @@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index da930b01a147..9b0158fa431f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -260,8 +260,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - if (head) - kfree_skb(head); + kfree_skb(head); ipq_put(qp); } @@ -823,7 +822,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[0].extra1 = &net->ipv4.frags.low_thresh; - table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c3385a84f8ff..38befe829caf 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -239,12 +239,12 @@ static void gre_err(struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, - skb->dev->ifindex, 0, IPPROTO_GRE, 0); + skb->dev->ifindex, IPPROTO_GRE); return; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, - IPPROTO_GRE, 0); + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, + IPPROTO_GRE); return; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..26c36cccabdc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - const struct iphdr *iph = ip_hdr(skb); __be16 *ports; int end; @@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; + sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..284a22154b4e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,6 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); + unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -636,6 +637,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; + /* ensure we can access the inner net header, for several users below */ + if (skb->protocol == htons(ETH_P_IP)) + inner_nhdr_len = sizeof(struct iphdr); + else if (skb->protocol == htons(ETH_P_IPV6)) + inner_nhdr_len = sizeof(struct ipv6hdr); + if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) + goto tx_error; + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f38cb21d773d..de31b302d69c 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -318,9 +318,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + ipv4_update_pmtu(skb, net, info, 0, protocol); else - ipv4_redirect(skb, net, 0, 0, protocol, 0); + ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index d97f4f2787f5..9119d012ba46 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -48,9 +48,9 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP); else - ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_COMP); xfrm_state_put(x); return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c891235b4966..e65287c27e3d 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -175,13 +175,12 @@ static int ipip_err(struct sk_buff *skb, u32 info) } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, net, info, t->parms.link, 0, - iph->protocol, 0); + ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0); + ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5660adcf7a04..91b0d5671649 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2527,6 +2527,13 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { + if (cb->strict_check) { + int err = ip_valid_fib_dump_req(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, _ipmr_fill_mroute, &mfc_unres_lock); } @@ -2710,6 +2717,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) return true; } +static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); + return -EINVAL; + } + + return 0; +} + static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -2718,6 +2750,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct mr_table *mrt; + if (cb->strict_check) { + int err = ipmr_valid_dumplink(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_t = cb->args[0]; s_e = cb->args[1]; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 04311f7067e2..6d218f5a2e71 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -5,8 +5,8 @@ #include <net/net_namespace.h> #include <net/tcp.h> -int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, - u32 *metrics) +static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, + int fc_mx_len, u32 *metrics) { bool ecn_ca = false; struct nlattr *nla; @@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, return 0; } -EXPORT_SYMBOL_GPL(ip_metrics_convert); + +struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, + int fc_mx_len) +{ + struct dst_metrics *fib_metrics; + int err; + + if (!fc_mx) + return (struct dst_metrics *)&dst_default_metrics; + + fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); + if (unlikely(!fib_metrics)) + return ERR_PTR(-ENOMEM); + + err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); + if (!err) { + refcount_set(&fib_metrics->refcnt, 1); + } else { + kfree(fib_metrics); + fib_metrics = ERR_PTR(err); + } + + return fib_metrics; +} +EXPORT_SYMBOL_GPL(ip_fib_metrics_init); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 12843c9ef142..0b10d8812828 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; - bool dev_match; int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) return false; } - dev_match = false; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (ret = 0; ret < res.fi->fib_nhs; ret++) { - struct fib_nh *nh = &res.fi->fib_nh[ret]; - - if (nh->nh_dev == dev) { - dev_match = true; - break; - } - } -#else - if (FIB_RES_DEV(res) == dev) - dev_match = true; -#endif - return dev_match || flags & XT_RPFILTER_LOOSE; + return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE; } static bool diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index e50976e3c213..94eb25bc8d7e 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi4_iif = LOOPBACK_IFINDEX, }; const struct net_device *oif; - struct net_device *found; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int i; -#endif + const struct net_device *found; /* * Do not set flowi4_oif, it restricts results (for example, asking @@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (!oif) { found = FIB_RES_DEV(res); - goto ok; - } - -#ifdef CONFIG_IP_ROUTE_MULTIPATH - for (i = 0; i < res.fi->fib_nhs; i++) { - struct fib_nh *nh = &res.fi->fib_nh[i]; + } else { + if (!fib_info_nh_uses_dev(res.fi, oif)) + return; - if (nh->nh_dev == oif) { - found = nh->nh_dev; - goto ok; - } + found = oif; } - return; -#else - found = FIB_RES_DEV(res); - if (found != oif) - return; -#endif -ok: + switch (priv->result) { case NFT_FIB_RESULT_OIF: *dest = found->ifindex; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33df4d76db2d..8ca3eb06ba04 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b678466da451..f71d2395c428 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1040,17 +1040,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - - if (!mark) - mark = IP4_REPLY_MARK(net, skb->mark); + u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1132,14 +1130,14 @@ out: EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); @@ -1219,18 +1217,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) src = ip_hdr(skb)->saddr; else { struct fib_result res; - struct flowi4 fl4; - struct iphdr *iph; - - iph = ip_hdr(skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = iph->daddr; - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = rt->dst.dev->ifindex; - fl4.flowi4_iif = skb->dev->ifindex; - fl4.flowi4_mark = skb->mark; + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->dst.dev->ifindex, + .flowi4_iif = skb->dev->ifindex, + .flowi4_mark = skb->mark, + }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) @@ -1481,12 +1476,9 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *)dst; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt_del_uncached_list(rt); } @@ -1533,11 +1525,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); - if (fi->fib_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&fi->fib_metrics->refcnt); - } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -2785,7 +2774,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi4 fl4; + struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; @@ -2825,7 +2814,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; - memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c3387dfd725b..606f868d9f3f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req) ts <<= TSBITS; ts |= options; } - return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ); + return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..891ed2f91467 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; +static u32 u32_max_div_HZ = UINT_MAX / HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, - .maxlen = sizeof(int), + .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 67670fac7c8d..43ef83b2330e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1295,7 +1295,7 @@ new_segment: copy = size_goal; /* All packets are restored as if they have - * already been sent. skb_mstamp isn't set to + * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) @@ -1753,6 +1753,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct vm_area_struct *vma; struct sk_buff *skb = NULL; struct tcp_sock *tp; + int inq; int ret; if (address & (PAGE_SIZE - 1) || address != zc->address) @@ -1773,12 +1774,15 @@ static int tcp_zerocopy_receive(struct sock *sk, tp = tcp_sk(sk); seq = tp->copied_seq; - zc->length = min_t(u32, zc->length, tcp_inq(sk)); + inq = tcp_inq(sk); + zc->length = min_t(u32, zc->length, inq); zc->length &= ~(PAGE_SIZE - 1); - - zap_page_range(vma, address, zc->length); - - zc->recv_skip_hint = 0; + if (zc->length) { + zap_page_range(vma, address, zc->length); + zc->recv_skip_hint = 0; + } else { + zc->recv_skip_hint = inq; + } ret = 0; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { @@ -1801,8 +1805,17 @@ static int tcp_zerocopy_receive(struct sock *sk, frags++; } } - if (frags->size != PAGE_SIZE || frags->page_offset) + if (frags->size != PAGE_SIZE || frags->page_offset) { + int remaining = zc->recv_skip_hint; + + while (remaining && (frags->size != PAGE_SIZE || + frags->page_offset)) { + remaining -= frags->size; + frags++; + } + zc->recv_skip_hint -= remaining; break; + } ret = vm_insert_page(vma, address + length, skb_frag_page(frags)); if (ret) @@ -2403,16 +2416,10 @@ adjudge_to_death: sock_hold(sk); sock_orphan(sk); - /* It is the last release_sock in its life. It will remove backlog. */ - release_sock(sk); - - - /* Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ local_bh_disable(); bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); + /* remove backlog if any, without releasing ownership. */ + __release_sock(sk); percpu_counter_inc(sk->sk_prot->orphan_count); @@ -2481,6 +2488,7 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); + release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); @@ -3896,8 +3904,8 @@ void __init tcp_init(void) init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_rmem[1] = 87380; - init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[1] = 131072; + init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 02ff2dde9609..a5786e3e2c16 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +static const int bbr_pacing_marging_percent = 1; + /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting @@ -208,12 +211,10 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { unsigned int mss = tcp_sk(sk)->mss_cache; - if (!tcp_needs_internal_pacing(sk)) - mss = tcp_mss_to_mtu(sk, mss); rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC; + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent); return rate >> BW_SCALE; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9034073138c..188980c58f87 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) } } -/* 3. Tuning rcvbuf, when connection enters established state. */ -static void tcp_fixup_rcvbuf(struct sock *sk) -{ - u32 mss = tcp_sk(sk)->advmss; - int rcvmem; - - rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * - tcp_default_init_rwnd(mss); - - /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency - * Allow enough cushion so that sender is not limited by our window - */ - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) - rcvmem <<= 2; - - if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); -} - -/* 4. Try to fixup all. It is made immediately after connection enters +/* 3. Try to fixup all. It is made immediately after connection enters * established state. */ void tcp_init_buffer_space(struct sock *sk) @@ -454,12 +435,10 @@ void tcp_init_buffer_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int maxwin; - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = tp->rcv_wnd; + tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } -/* 5. Recalculate window clamp after socket hit its memory bounds. */ +/* 4. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1305,7 +1284,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) @@ -1580,7 +1559,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); @@ -3103,7 +3082,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else if (!(sacked & TCPCB_SACKED_ACKED)) { - last_ackt = skb->skb_mstamp; + last_ackt = tcp_skb_timestamp_us(skb); WARN_ON_ONCE(last_ackt == 0); if (!first_ackt) first_ackt = last_ackt; @@ -3121,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - skb->skb_mstamp); + tcp_skb_timestamp_us(skb)); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3215,7 +3194,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, + tcp_skb_timestamp_us(skb))) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -6022,11 +6002,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->fin) goto discard; /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 09547ef9c4c6..de47038afdf0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -544,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) BUG_ON(!skb); tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); @@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 597dbd749f05..059b67af28b1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,6 +45,22 @@ #include <trace/events/tcp.h> +/* Refresh clocks of a TCP socket, + * ensuring monotically increasing values. + */ +void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_ns(); + + /* departure time for next data packet */ + if (val > tp->tcp_wstamp_ns) + tp->tcp_wstamp_ns = val; + + val = div_u64(val, NSEC_PER_USEC); + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -179,21 +195,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } - -u32 tcp_default_init_rwnd(u32 mss) -{ - /* Initial receive window should be twice of TCP_INIT_CWND to - * enable proper sending of new unsent data during fast recovery - * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a - * limit when mss is larger than 1460. - */ - u32 init_rwnd = TCP_INIT_CWND * 2; - - if (mss > 1460) - init_rwnd = max((1460 * init_rwnd) / mss, 2U); - return init_rwnd; -} - /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -228,7 +229,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else - (*rcv_wnd) = space; + (*rcv_wnd) = min_t(u32, space, U16_MAX); + + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); (*rcv_wscale) = 0; if (wscale_ok) { @@ -241,11 +245,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale)++; } } - - if (!init_rcv_wnd) /* Use default unless specified otherwise */ - init_rcv_wnd = tcp_default_init_rwnd(mss); - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } @@ -977,28 +976,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) +static void tcp_internal_pacing(struct sock *sk) { - u64 len_ns; - u32 rate; - if (!tcp_needs_internal_pacing(sk)) return; - rate = sk->sk_pacing_rate; - if (!rate || rate == ~0U) - return; - - len_ns = (u64)skb->len * NSEC_PER_SEC; - do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, - ktime_add_ns(ktime_get(), len_ns), + ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } -static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb) { - skb->skb_mstamp = tp->tcp_mstamp; + struct tcp_sock *tp = tcp_sk(sk); + + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + if (sk->sk_pacing_status != SK_PACING_NONE) { + u32 rate = sk->sk_pacing_rate; + + /* Original sch_fq does not pace first 10 MSS + * Note that tp->data_segs_out overflows after 2^32 packets, + * this is a minor annoyance. + */ + if (rate != ~0U && rate && tp->data_segs_out >= 10) { + tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate); + + tcp_internal_pacing(sk); + } + } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } @@ -1045,7 +1050,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; } - skb->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1137,7 +1142,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; - tcp_internal_pacing(sk, skb); } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) @@ -1149,8 +1153,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); - /* Our usage of tstamp should remain private */ - skb->tstamp = 0; + /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), @@ -1163,7 +1166,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = net_xmit_eval(err); } if (!err && oskb) { - tcp_update_skb_after_send(tp, oskb); + tcp_update_skb_after_send(sk, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1966,7 +1969,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); + age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) goto send_now; @@ -2312,7 +2315,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - tcp_update_skb_after_send(tp, skb); + tcp_update_skb_after_send(sk, skb); goto repair; /* Skip network transmission */ } @@ -2887,7 +2890,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } tcp_skb_tsorted_restore(skb); if (!err) { - tcp_update_skb_after_send(tp, skb); + tcp_update_skb_after_send(sk, skb); tcp_rate_skb_sent(sk, skb); } } else { @@ -3205,10 +3208,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - skb->skb_mstamp = cookie_init_timestamp(req); + skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif - skb->skb_mstamp = tcp_clock_us(); + skb->skb_mstamp_ns = tcp_clock_ns(); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); @@ -3424,7 +3427,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp = syn_data->skb_mstamp; + syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 4dff40dad4dc..baed2186c7c6 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) * bandwidth estimate. */ if (!tp->packets_out) { - tp->first_tx_mstamp = skb->skb_mstamp; - tp->delivered_mstamp = skb->skb_mstamp; + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; } TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; @@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta( - skb->skb_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = skb->skb_mstamp; } /* Mark off the skb delivered once it's sacked to avoid being * used again when it's cumulatively acked. For acked packets diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c81aadff769b..fdb715bdd2d1 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): @@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + if (!tcp_rack_sent_after(tp->rack.mstamp, + tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7fdf222a0bdf..61023d50cd60 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(skb); if (!start_ts) - skb->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f4e35b2ff8b8..1bec2203d558 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; @@ -1889,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { static_branch_enable(&udp_encap_needed_key); @@ -2124,6 +2124,28 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, inet_compute_pseudo); } +/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and + * return code conversion for ip layer consumption + */ +static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, + struct udphdr *uh) +{ + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + + ret = udp_queue_rcv_skb(sk, skb); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; +} + /* * All we need to do is get the socket, and then do a checksum. */ @@ -2170,14 +2192,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp_sk_rx_dst_set(sk, dst); - ret = udp_queue_rcv_skb(sk, skb); + ret = udp_unicast_rcv_skb(sk, skb, uh); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; + return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) @@ -2185,22 +2202,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk) { - int ret; - - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - inet_compute_pseudo); - - ret = udp_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; - } + if (sk) + return udp_unicast_rcv_skb(sk, skb, uh); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0c0522b79b43..802f2bc00d69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -405,7 +405,7 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head, { struct udphdr *uh = udp_gro_udphdr(skb); - if (unlikely(!uh)) + if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index bcfc00e88756..f8de2482a529 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 3d36644890bb..1ad2c2c4e250 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } |