diff options
-rw-r--r-- | Documentation/networking/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/networking/checksum-offloads.txt | 119 | ||||
-rw-r--r-- | drivers/net/vxlan.c | 6 | ||||
-rw-r--r-- | include/linux/skbuff.h | 26 | ||||
-rw-r--r-- | include/net/ip_tunnels.h | 3 | ||||
-rw-r--r-- | include/net/udp_tunnel.h | 2 | ||||
-rw-r--r-- | net/ipv4/fou.c | 14 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 17 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 22 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 28 | ||||
-rw-r--r-- | net/ipv6/ip6_checksum.c | 23 | ||||
-rw-r--r-- | net/ipv6/sit.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 6 |
14 files changed, 197 insertions, 77 deletions
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index df27a1a50776..415154a487d0 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -44,6 +44,8 @@ can.txt - documentation on CAN protocol family. cdc_mbim.txt - 3G/LTE USB modem (Mobile Broadband Interface Model) +checksum-offloads.txt + - Explanation of checksum offloads; LCO, RCO cops.txt - info on the COPS LocalTalk Linux driver cs89x0.txt diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt new file mode 100644 index 000000000000..de2a327766a7 --- /dev/null +++ b/Documentation/networking/checksum-offloads.txt @@ -0,0 +1,119 @@ +Checksum Offloads in the Linux Networking Stack + + +Introduction +============ + +This document describes a set of techniques in the Linux networking stack + to take advantage of checksum offload capabilities of various NICs. + +The following technologies are described: + * TX Checksum Offload + * LCO: Local Checksum Offload + * RCO: Remote Checksum Offload + +Things that should be documented here but aren't yet: + * RX Checksum Offload + * CHECKSUM_UNNECESSARY conversion + + +TX Checksum Offload +=================== + +The interface for offloading a transmit checksum to a device is explained + in detail in comments near the top of include/linux/skbuff.h. +In brief, it allows to request the device fill in a single ones-complement + checksum defined by the sk_buff fields skb->csum_start and + skb->csum_offset. The device should compute the 16-bit ones-complement + checksum (i.e. the 'IP-style' checksum) from csum_start to the end of the + packet, and fill in the result at (csum_start + csum_offset). +Because csum_offset cannot be negative, this ensures that the previous + value of the checksum field is included in the checksum computation, thus + it can be used to supply any needed corrections to the checksum (such as + the sum of the pseudo-header for UDP or TCP). +This interface only allows a single checksum to be offloaded. Where + encapsulation is used, the packet may have multiple checksum fields in + different header layers, and the rest will have to be handled by another + mechanism such as LCO or RCO. +No offloading of the IP header checksum is performed; it is always done in + software. This is OK because when we build the IP header, we obviously + have it in cache, so summing it isn't expensive. It's also rather short. +The requirements for GSO are more complicated, because when segmenting an + encapsulated packet both the inner and outer checksums may need to be + edited or recomputed for each resulting segment. See the skbuff.h comment + (section 'E') for more details. + +A driver declares its offload capabilities in netdev->hw_features; see + Documentation/networking/netdev-features for more. Note that a device + which only advertises NETIF_F_IP[V6]_CSUM must still obey the csum_start + and csum_offset given in the SKB; if it tries to deduce these itself in + hardware (as some NICs do) the driver should check that the values in the + SKB match those which the hardware will deduce, and if not, fall back to + checksumming in software instead (with skb_checksum_help or one of the + skb_csum_off_chk* functions as mentioned in include/linux/skbuff.h). This + is a pain, but that's what you get when hardware tries to be clever. + +The stack should, for the most part, assume that checksum offload is + supported by the underlying device. The only place that should check is + validate_xmit_skb(), and the functions it calls directly or indirectly. + That function compares the offload features requested by the SKB (which + may include other offloads besides TX Checksum Offload) and, if they are + not supported or enabled on the device (determined by netdev->features), + performs the corresponding offload in software. In the case of TX + Checksum Offload, that means calling skb_checksum_help(skb). + + +LCO: Local Checksum Offload +=========================== + +LCO is a technique for efficiently computing the outer checksum of an + encapsulated datagram when the inner checksum is due to be offloaded. +The ones-complement sum of a correctly checksummed TCP or UDP packet is + equal to the sum of the pseudo header, because everything else gets + 'cancelled out' by the checksum field. This is because the sum was + complemented before being written to the checksum field. +More generally, this holds in any case where the 'IP-style' ones complement + checksum is used, and thus any checksum that TX Checksum Offload supports. +That is, if we have set up TX Checksum Offload with a start/offset pair, we + know that _after the device has filled in that checksum_, the ones + complement sum from csum_start to the end of the packet will be equal to + _whatever value we put in the checksum field beforehand_. This allows us + to compute the outer checksum without looking at the payload: we simply + stop summing when we get to csum_start, then add the 16-bit word at + (csum_start + csum_offset). +Then, when the true inner checksum is filled in (either by hardware or by + skb_checksum_help()), the outer checksum will become correct by virtue of + the arithmetic. + +LCO is performed by the stack when constructing an outer UDP header for an + encapsulation such as VXLAN or GENEVE, in udp_set_csum(). Similarly for + the IPv6 equivalents, in udp6_set_csum(). +It is also performed when constructing an IPv4 GRE header, in + net/ipv4/ip_gre.c:build_header(). It is *not* currently performed when + constructing an IPv6 GRE header; the GRE checksum is computed over the + whole packet in net/ipv6/ip6_gre.c:ip6gre_xmit2(), but it should be + possible to use LCO here as IPv6 GRE still uses an IP-style checksum. +All of the LCO implementations use a helper function lco_csum(), in + include/linux/skbuff.h. + +LCO can safely be used for nested encapsulations; in this case, the outer + encapsulation layer will sum over both its own header and the 'middle' + header. This does mean that the 'middle' header will get summed multiple + times, but there doesn't seem to be a way to avoid that without incurring + bigger costs (e.g. in SKB bloat). + + +RCO: Remote Checksum Offload +============================ + +RCO is a technique for eliding the inner checksum of an encapsulated + datagram, allowing the outer checksum to be offloaded. It does, however, + involve a change to the encapsulation protocols, which the receiver must + also support. For this reason, it is disabled by default. +RCO is detailed in the following Internet-Drafts: +https://tools.ietf.org/html/draft-herbert-remotecsumoffload-00 +https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 +In Linux, RCO is implemented individually in each encapsulation protocol, + and most tunnel types have flags controlling its use. For instance, VXLAN + has the flag VXLAN_F_REMCSUM_TX (per struct vxlan_rdst) to indicate that + RCO should be used when transmitting to a given remote destination. diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ebf57d90d295..0a23c64379d6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1702,10 +1702,8 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || - skb->csum_offset == offsetof(struct tcphdr, check))) { - udp_sum = false; + skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; - } } min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len @@ -1723,7 +1721,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, if (WARN_ON(!skb)) return -ENOMEM; - skb = iptunnel_handle_offloads(skb, udp_sum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6ec86f1a2ed9..39206751463e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3702,5 +3702,31 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) return hdr_len + skb_gso_transport_seglen(skb); } +/* Local Checksum Offload. + * Compute outer checksum based on the assumption that the + * inner checksum will be offloaded later. + * See Documentation/networking/checksum-offloads.txt for + * explanation of how this works. + * Fill in outer checksum adjustment (e.g. with sum of outer + * pseudo-header) before calling. + * Also ensure that inner checksum is in linear data area. + */ +static inline __wsum lco_csum(struct sk_buff *skb) +{ + char *inner_csum_field; + __wsum csum; + + /* Start with complement of inner checksum adjustment */ + inner_csum_field = skb->data + skb_checksum_start_offset(skb) + + skb->csum_offset; + csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field); + /* Add in checksum of our headers (incl. outer checksum + * adjustment filled in by caller) + */ + csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum); + /* The result is the checksum from skb->data to end of packet */ + return csum; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6db96ea0144f..bc439f32baa9 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -279,8 +279,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); -struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, - int gso_type_mask); +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index cca2ad3082c3..97f5adb121a6 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,7 +103,7 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - return iptunnel_handle_offloads(skb, udp_csum, type); + return iptunnel_handle_offloads(skb, type); } static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dcf6991..88dab0c1670c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -774,7 +774,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); - uh->check = 0; udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +783,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; __be16 sport; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -804,8 +803,8 @@ EXPORT_SYMBOL(fou_build_header); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; __be16 sport; @@ -814,7 +813,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { - csum = false; optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; @@ -822,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7c51c4e1661f..65748db44285 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -440,6 +440,17 @@ drop: return 0; } +static __sum16 gre_checksum(struct sk_buff *skb) +{ + __wsum csum; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csum = lco_csum(skb); + else + csum = skb_checksum(skb, 0, skb->len, 0); + return csum_fold(csum); +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -467,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); + *(__sum16 *)ptr = gre_checksum(skb); } } } @@ -493,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static struct rtable *gre_get_rt(struct sk_buff *skb, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415c0b2d..a6e58b6141cd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -148,7 +148,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, int gso_type_mask) { int err; @@ -166,20 +165,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; + /* We clear encapsulation here to prevent badly-written + * drivers potentially deciding to offload an inner checksum + * if we set CHECKSUM_PARTIAL on the outer header. + * This should go away when the drivers are all fixed. + */ + skb->encapsulation = 0; + } return skb; error: diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da61e747..6ec5b42fd172 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ac3cedb25a9f..9fc4e9c06aae 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -848,32 +848,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, { struct udphdr *uh = udp_hdr(skb); - if (nocheck) + if (nocheck) { uh->check = 0; - else if (skb_is_gso(skb)) + } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v4_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp_set_csum); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 9a4d7322fb22..8f920580976f 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -98,27 +98,16 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb, uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v6_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2066d1c25a11..9a6b407f5840 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -911,7 +911,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT); if (IS_ERR(skb)) { ip_rt_put(rt); goto out; @@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3264cb49b333..a3f5cd9b3c4c 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1019,8 +1019,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1112,8 +1111,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; |