diff options
-rw-r--r-- | include/net/ip_vs.h | 17 | ||||
-rw-r--r-- | include/uapi/linux/netfilter/nf_conntrack_common.h | 12 | ||||
-rw-r--r-- | include/uapi/linux/openvswitch.h | 49 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 30 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 30 | ||||
-rw-r--r-- | net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipset/ip_set_core.c | 3 | ||||
-rw-r--r-- | net/netfilter/ipset/ip_set_hash_mac.c | 3 | ||||
-rw-r--r-- | net/netfilter/ipset/ip_set_list_set.c | 55 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 38 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_pe_sip.c | 6 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 6 | ||||
-rw-r--r-- | net/netfilter/nfnetlink_acct.c | 3 | ||||
-rw-r--r-- | net/netfilter/nft_compat.c | 6 | ||||
-rw-r--r-- | net/netfilter/x_tables.c | 3 | ||||
-rw-r--r-- | net/openvswitch/Kconfig | 3 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 660 | ||||
-rw-r--r-- | net/openvswitch/conntrack.h | 3 |
18 files changed, 795 insertions, 134 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0816c872b689..a6cc576fd467 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) } #endif /* CONFIG_IP_VS_NFCT */ +/* Really using conntrack? */ +static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, + struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_NFCT + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + return false; + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + return true; +#endif + return false; +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 319f47128db8..6d074d14ee27 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -20,9 +20,15 @@ enum ip_conntrack_info { IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY, IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY, - IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY, - /* Number of distinct IP_CT types (no NEW in reply dirn). */ - IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 + /* No NEW in reply direction. */ + + /* Number of distinct IP_CT types. */ + IP_CT_NUMBER, + + /* only for userspace compatibility */ +#ifndef __KERNEL__ + IP_CT_NEW_REPLY = IP_CT_NUMBER, +#endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a27222d5b413..616d04761730 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -454,6 +454,14 @@ struct ovs_key_ct_labels { #define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */ #define OVS_CS_F_INVALID 0x10 /* Could not track connection. */ #define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. 
*/ +#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was + * mangled by NAT. + */ +#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port + * was mangled by NAT. + */ + +#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. @@ -632,6 +640,8 @@ struct ovs_action_hash { * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. + * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address + * translation (NAT) on the packet. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -641,12 +651,51 @@ enum ovs_ct_attr { OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */ OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of related connections. */ + OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ __OVS_CT_ATTR_MAX }; #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) /** + * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT. + * + * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port). + * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination + * address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be + * specified. Effective only for packets for ct_state NEW connections. + * Packets of committed connections are mangled by the NAT action according to + * the committed NAT type regardless of the flags specified. As a corollary, a + * NAT action without a NAT type flag will only mangle packets of committed + * connections. The following NAT attributes only apply for NEW + * (non-committed) connections, and they may be included only when the CT + * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or + * @OVS_NAT_ATTR_DST is also included. 
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port) + * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port) + * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots + * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5) + * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping + */ +enum ovs_nat_attr { + OVS_NAT_ATTR_UNSPEC, + OVS_NAT_ATTR_SRC, + OVS_NAT_ATTR_DST, + OVS_NAT_ATTR_IP_MIN, + OVS_NAT_ATTR_IP_MAX, + OVS_NAT_ATTR_PROTO_MIN, + OVS_NAT_ATTR_PROTO_MAX, + OVS_NAT_ATTR_PERSISTENT, + OVS_NAT_ATTR_PROTO_HASH, + OVS_NAT_ATTR_PROTO_RANDOM, + __OVS_NAT_ATTR_MAX, +}; + +#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) + +/** * enum ovs_action_attr - Action types. * * @OVS_ACTION_ATTR_OUTPUT: Output packet to port. diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 61c7cc22ea68..f8aad03d674b 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { - const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - ip_hdrlen(skb); - skb->csum_offset = (void *)check - data; - *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, 0); - } else { - *check = 0; - *check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, - csum_partial(data, datalen, - 0)); - if (proto == IPPROTO_UDP && !*check) - *check = 
CSUM_MANGLED_0; - } + const struct iphdr *iph = ip_hdr(skb); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + + ip_hdrlen(skb); + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, + proto, 0); } else inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6ce309928841..e0be97e636a4 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt6i_flags & RTF_LOCAL) && - (!skb->dev || skb->dev->features & - (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - (data - (void *)skb->data); - skb->csum_offset = (void *)check - data; - *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - datalen, proto, 0); - } else { - *check = 0; - *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, - datalen, proto, - csum_partial(data, datalen, - 0)); - if (proto == IPPROTO_UDP && !*check) - *check = CSUM_MANGLED_0; - } + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + + (data - (void *)skb->data); + skb->csum_offset = (void *)check - data; + *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, 0); } else inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 
29dde208381d..9a065f672d3a 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) { + if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN) + return -IPSET_ERR_PROTOCOL; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); e.add_mac = 1; } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 95db43fc0303..7e6568cad494 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; + /* Must wait for flush to be really finished in list:set */ + rcu_barrier(); + /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index f1e7d2c0f685..8f004edad396 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - if (unlikely(!tb[IPSET_ATTR_ETHER])) + if (unlikely(!tb[IPSET_ATTR_ETHER] || + nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bbede95c9f68..24c6c1962aea 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set"); struct set_elem { struct rcu_head rcu; struct list_head list; + struct ip_set *set; /* Sigh, in order to cleanup reference */ ip_set_id_t id; } __aligned(__alignof__(u64)); @@ -151,30 
+152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, /* Userspace interfaces: we are protected by the nfnl mutex */ static void -__list_set_del(struct ip_set *set, struct set_elem *e) +__list_set_del_rcu(struct rcu_head * rcu) { + struct set_elem *e = container_of(rcu, struct set_elem, rcu); + struct ip_set *set = e->set; struct list_set *map = set->data; ip_set_put_byindex(map->net, e->id); - /* We may call it, because we don't have a to be destroyed - * extension which is used by the kernel. - */ ip_set_ext_destroy(set, e); - kfree_rcu(e, rcu); + kfree(e); } static inline void list_set_del(struct ip_set *set, struct set_elem *e) { list_del_rcu(&e->list); - __list_set_del(set, e); + call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +list_set_replace(struct set_elem *e, struct set_elem *old) { list_replace_rcu(&old->list, &e->list); - __list_set_del(set, old); + call_rcu(&old->rcu, __list_set_del_rcu); } static void @@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - if (SET_WITH_TIMEOUT(set)) - set_cleanup_entries(set); - /* Find where to add the new entry */ n = prev = next = NULL; list_for_each_entry(e, &map->members, list) { @@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!e) return -ENOMEM; e->id = d->id; + e->set = set; INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(set, e, n); + list_set_replace(e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); + list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); ip_set_put_byindex(map->net, e->id); @@ -450,8 +449,10 @@ 
list_set_head(struct ip_set *set, struct sk_buff *skb) struct set_elem *e; u32 n = 0; - list_for_each_entry(e, &map->members, list) + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) n++; + rcu_read_unlock(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) @@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set, atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - list_for_each_entry(e, &map->members, list) { - if (i == first) - break; - i++; - } rcu_read_lock(); - list_for_each_entry_from(e, &map->members, list) { - i++; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry_rcu(e, &map->members, list) { + if (i < first || + (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set)))) { + i++; continue; + } nested = ipset_nest_start(skb, IPSET_ATTR_DATA); - if (!nested) { - if (i == first) { - nla_nest_cancel(skb, atd); - ret = -EMSGSIZE; - goto out; - } + if (!nested) goto nla_put_failure; - } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; ipset_nest_end(skb, nested); + i++; } ipset_nest_end(skb, atd); @@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set, nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { + nla_nest_cancel(skb, atd); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; + } else { + cb->args[IPSET_CB_ARG0] = i; } - cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); out: rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f57b4dcdb233..b9a4082afa3a 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, switch (cp->protocol) { case IPPROTO_TCP: return (cp->state == IP_VS_TCP_S_TIME_WAIT) || + (cp->state == 
IP_VS_TCP_S_CLOSE) || ((conn_reuse_mode & 2) && (cp->state == IP_VS_TCP_S_FIN_WAIT) && (cp->flags & IP_VS_CONN_F_NOOUTPUT)); @@ -1757,15 +1758,34 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int cp = pp->conn_in_get(ipvs, af, skb, &iph); conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); - if (conn_reuse_mode && !iph.fragoffs && - is_new_conn(skb, &iph) && cp && - ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && - unlikely(!atomic_read(&cp->dest->weight))) || - unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) { - if (!atomic_read(&cp->n_control)) - ip_vs_conn_expire_now(cp); - __ip_vs_conn_put(cp); - cp = NULL; + if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { + bool uses_ct = false, resched = false; + + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && + unlikely(!atomic_read(&cp->dest->weight))) { + resched = true; + uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + } else if (is_new_conn_expected(cp, conn_reuse_mode)) { + uses_ct = ip_vs_conn_uses_conntrack(cp, skb); + if (!atomic_read(&cp->n_control)) { + resched = true; + } else { + /* Do not reschedule controlling connection + * that uses conntrack while it is still + * referenced by controlled connection(s). 
+ */ + resched = !uses_ct; + } + } + + if (resched) { + if (!atomic_read(&cp->n_control)) + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (uses_ct) + return NF_DROP; + cp = NULL; + } } if (unlikely(!cp)) { diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 1b8d594e493a..0a6eb5c0d9e9 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -70,10 +70,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) const char *dptr; int retc; - ip_vs_fill_iph_skb(p->af, skb, false, &iph); + retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph); /* Only useful with UDP */ - if (iph.protocol != IPPROTO_UDP) + if (!retc || iph.protocol != IPPROTO_UDP) return -EINVAL; /* todo: IPv6 fragments: * I think this only should be done for the first fragment. /HS @@ -88,7 +88,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) dptr = skb->data + dataoff; datalen = skb->len - dataoff; - if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + if (get_callid(dptr, 0, datalen, &matchoff, &matchlen)) return -EINVAL; /* N.B: pe_data is only set on success, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f60b4fdeeb8c..afde5f5e728a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -74,8 +74,7 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) spin_lock(lock); while (unlikely(nf_conntrack_locks_all)) { spin_unlock(lock); - spin_lock(&nf_conntrack_locks_all_lock); - spin_unlock(&nf_conntrack_locks_all_lock); + spin_unlock_wait(&nf_conntrack_locks_all_lock); spin_lock(lock); } } @@ -121,8 +120,7 @@ static void nf_conntrack_all_lock(void) nf_conntrack_locks_all = true; for (i = 0; i < CONNTRACK_LOCKS; i++) { - spin_lock(&nf_conntrack_locks[i]); - spin_unlock(&nf_conntrack_locks[i]); + spin_unlock_wait(&nf_conntrack_locks[i]); } } diff --git a/net/netfilter/nfnetlink_acct.c 
b/net/netfilter/nfnetlink_acct.c index 5274b04c42a6..4c2b4c0c4d5f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -242,6 +242,9 @@ nfacct_filter_alloc(const struct nlattr * const attr) if (err < 0) return ERR_PTR(err); + if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) + return ERR_PTR(-EINVAL); + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); if (!filter) return ERR_PTR(-ENOMEM); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 454841baa4d0..6228c422c766 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx, if (IS_ERR(match)) return ERR_PTR(-ENOENT); + if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) + return ERR_PTR(-EINVAL); + /* This is the first time we use this match, allocate operations */ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); if (nft_match == NULL) @@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) + return ERR_PTR(-EINVAL); + /* This is the first time we use this target, allocate operations */ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL); if (nft_target == NULL) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d0cd2b9bf844..582c9cfd6567 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -659,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; + if (sz < sizeof(*info)) + return NULL; + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index cd5fd9d728a7..234a73344c6e 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -6,7 +6,8 @@ config 
OPENVSWITCH tristate "Open vSwitch" depends on INET depends on !NF_CONNTRACK || \ - (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) + (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ + (!NF_NAT || NF_NAT))) select LIBCRC32C select MPLS select NET_MPLS_GSO diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index ee6ff8ffc12d..dc5eb29fe7d6 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -13,21 +13,31 @@ #include <linux/module.h> #include <linux/openvswitch.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> #include <net/ip.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#ifdef CONFIG_NF_NAT_NEEDED +#include <linux/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#endif + #include "datapath.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" struct ovs_ct_len_tbl { - size_t maxlen; - size_t minlen; + int maxlen; + int minlen; }; /* Metadata mark for masked write to conntrack mark */ @@ -42,15 +52,25 @@ struct md_labels { struct ovs_key_ct_labels mask; }; +enum ovs_ct_nat { + OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ + OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ + OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u8 commit : 1; + u8 nat : 3; /* enum ovs_ct_nat */ u16 family; struct md_mark mark; struct md_labels labels; +#ifdef CONFIG_NF_NAT_NEEDED + struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. 
*/ +#endif }; static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); @@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) switch (ctinfo) { case IP_CT_ESTABLISHED_REPLY: case IP_CT_RELATED_REPLY: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_REPLY_DIR; break; default: @@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) ct_state |= OVS_CS_F_RELATED; break; case IP_CT_NEW: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_NEW; break; default: @@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, ovs_ct_get_labels(ct, &key->ct.labels); } -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has - * previously sent the packet to conntrack via the ct action. +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. If + * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are + * initialized from the connection status. */ static void ovs_ct_update_key(const struct sk_buff *skb, const struct ovs_conntrack_info *info, struct sw_flow_key *key, bool post_ct, + bool keep_nat_flags) { const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; @@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb, ct = nf_ct_get(skb, &ctinfo); if (ct) { state = ovs_ct_get_state(ctinfo); + /* All unconfirmed entries are NEW connections. */ if (!nf_ct_is_confirmed(ct)) state |= OVS_CS_F_NEW; + /* OVS persists the related flag for the duration of the + * connection. 
+ */ if (ct->master) state |= OVS_CS_F_RELATED; + if (keep_nat_flags) { + state |= key->ct.state & OVS_CS_F_NAT_MASK; + } else { + if (ct->status & IPS_SRC_NAT) + state |= OVS_CS_F_SRC_NAT; + if (ct->status & IPS_DST_NAT) + state |= OVS_CS_F_DST_NAT; + } zone = nf_ct_zone(ct); } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; @@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb, __ovs_ct_update_key(key, state, zone, ct); } +/* This is called to initialize CT key fields possibly coming in from the local + * stack. + */ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - ovs_ct_update_key(skb, NULL, key, false); + ovs_ct_update_key(skb, NULL, key, false, false); } int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) @@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, struct nf_conn *ct; u32 new_mark; - /* The connection could be invalid, in which case set_mark is no-op. */ ct = nf_ct_get(skb, &ctinfo); if (!ct) @@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) enum ip_conntrack_info ctinfo; unsigned int protoff; struct nf_conn *ct; + int err; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) @@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) return NF_DROP; } - return helper->help(skb, protoff, ct, ctinfo); + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adjusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. 
+ */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; } /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero @@ -352,14 +399,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, return __nf_ct_expect_find(net, zone, &tuple); } +/* This replicates logic from nf_conntrack_core.c that is not exported. */ +static enum ip_conntrack_info +ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) +{ + const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + return IP_CT_ESTABLISHED_REPLY; + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + return IP_CT_ESTABLISHED; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return IP_CT_RELATED; + return IP_CT_NEW; +} + +/* Find an existing connection which this packet belongs to without + * re-attributing statistics or modifying the connection state. This allows an + * skb->nfct lost due to an upcall to be recovered during actions execution. + * + * Must be called with rcu_read_lock. + * + * On success, populates skb->nfct and skb->nfctinfo, and returns the + * connection. Returns NULL if there is no existing entry. 
+ */ +static struct nf_conn * +ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, + u8 l3num, struct sk_buff *skb) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + unsigned int dataoff; + u8 protonum; + + l3proto = __nf_ct_l3proto_find(l3num); + if (!l3proto) { + pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); + return NULL; + } + if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, + &protonum) <= 0) { + pr_debug("ovs_ct_find_existing: Can't get protonum\n"); + return NULL; + } + l4proto = __nf_ct_l4proto_find(l3num, protonum); + if (!l4proto) { + pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); + return NULL; + } + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, + protonum, net, &tuple, l3proto, l4proto)) { + pr_debug("ovs_ct_find_existing: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return NULL; /* Not found. */ + + ct = nf_ct_tuplehash_to_ctrack(h); + + ctinfo = ovs_ct_get_info(h); + if (ctinfo == IP_CT_NEW) { + /* This should not happen. */ + WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct); + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = ctinfo; + return ct; +} + /* Determine whether skb->nfct is equal to the result of conntrack lookup. */ -static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, - const struct ovs_conntrack_info *info) +static bool skb_nfct_cached(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. 
This happens when we lose a skb->nfct + * due to an upcall. If the connection was not confirmed, it is not + * cached and needs to be run through conntrack again. + */ + if (!ct && key->ct.state & OVS_CS_F_TRACKED && + !(key->ct.state & OVS_CS_F_INVALID) && + key->ct.zone == info->zone.id) + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -377,6 +511,206 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return true; } +#ifdef CONFIG_NF_NAT_NEEDED +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + int hooknum, nh_off, err = NF_ACCEPT; + + nh_off = skb_network_offset(skb); + skb_pull(skb, nh_off); + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (skb->protocol == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto push; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto push; + } +#endif + } + /* Non-ICMP, fall thru to initialize if needed. */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. 
+ */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto push; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto push; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +push: + skb_push(skb, nh_off); + + return err; +} + +static void ovs_nat_update_key(struct sw_flow_key *key, + const struct sk_buff *skb, + enum nf_nat_manip_type maniptype) +{ + if (maniptype == NF_NAT_MANIP_SRC) { + __be16 src; + + key->ct.state |= OVS_CS_F_SRC_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.src = ip_hdr(skb)->saddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, + sizeof(key->ipv6.addr.src)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + src = udp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_TCP) + src = tcp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_SCTP) + src = sctp_hdr(skb)->source; + else + return; + + key->tp.src = src; + } else { + __be16 dst; + + key->ct.state |= OVS_CS_F_DST_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.dst = ip_hdr(skb)->daddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, + sizeof(key->ipv6.addr.dst)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + dst = udp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_TCP) + dst = tcp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_SCTP) + dst = sctp_hdr(skb)->dest; + else + return; + + key->tp.dst = dst; + } +} + +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. 
*/ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + enum nf_nat_manip_type maniptype; + int err; + + if (nf_ct_is_untracked(ct)) { + /* A NAT action may only be performed on tracked packets. */ + return NF_ACCEPT; + } + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_ACCEPT; /* Can't NAT. */ + + /* Determine NAT type. + * Check if the NAT type can be deduced from the tracked connection. + * Make sure expected traffic is NATted only when committing. + */ + if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && + ct->status & IPS_NAT_MASK && + (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (info->nat & OVS_CT_SRC_NAT) { + maniptype = NF_NAT_MANIP_SRC; + } else if (info->nat & OVS_CT_DST_NAT) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; /* Connection is not NATed. */ + } + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + + /* Mark NAT done if successful and update the flow key. 
*/ + if (err == NF_ACCEPT) + ovs_nat_update_key(key, skb, maniptype); + + return err; +} +#else /* !CONFIG_NF_NAT_NEEDED */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + return NF_ACCEPT; +} +#endif + +/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if + * not done already. Update key with new CT state after passing the packet + * through conntrack. + * Note that if the packet is deemed invalid by conntrack, skb->nfct will be + * set to NULL and 0 will be returned. + */ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) @@ -386,8 +720,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * actually run the packet through conntrack twice unless it's for a * different zone. */ - if (!skb_nfct_cached(net, skb, info)) { + bool cached = skb_nfct_cached(net, key, info, skb); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!cached) { struct nf_conn *tmpl = info->ct; + int err; /* Associate skb with specified zone. */ if (tmpl) { @@ -398,17 +737,53 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, skb->nfctinfo = IP_CT_NEW; } - if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, - skb) != NF_ACCEPT) + /* Repeat if requested, see nf_iterate(). */ + do { + err = nf_conntrack_in(net, info->family, + NF_INET_PRE_ROUTING, skb); + } while (err == NF_REPEAT); + + if (err != NF_ACCEPT) return -ENOENT; - if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { - WARN_ONCE(1, "helper rejected packet"); + /* Clear CT state NAT flags to mark that we have not yet done + * NAT after the nf_conntrack_in() call. We can actually clear + * the whole state, as it will be re-initialized below. + */ + key->ct.state = 0; + + /* Update the key, but keep the NAT flags. 
*/ + ovs_ct_update_key(skb, info, key, true, true); + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + /* Packets starting a new connection must be NATted before the + * helper, so that the helper knows about the NAT. We enforce + * this by delaying both NAT and helper calls for unconfirmed + * connections until the committing CT action. For later + * packets NAT and Helper may be called in either order. + * + * NAT will be done only if the CT action has NAT, and only + * once per packet (per zone), as guarded by the NAT bits in + * the key->ct.state. + */ + if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && + (nf_ct_is_confirmed(ct) || info->commit) && + ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { return -EINVAL; } - } - ovs_ct_update_key(skb, info, key, true); + /* Call the helper only if: + * - nf_conntrack_in() was executed above ("!cached") for a + * confirmed connection, or + * - When committing an unconfirmed connection. + */ + if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + return -EINVAL; + } + } return 0; } @@ -420,19 +795,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, { struct nf_conntrack_expect *exp; + /* If we pass an expected packet through nf_conntrack_in() the + * expectation is typically removed, but the packet could still be + * lost in upcall processing. To prevent this from happening we + * perform an explicit expectation lookup. Expected connections are + * always new, and will be passed through conntrack only when they are + * committed, as it is OK to remove the expectation at that time. + */ exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); if (exp) { u8 state; + /* NOTE: New connections are NATted and Helped only when + * committed, so we are not calling into NAT here. 
+ */ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - } + } else + return __ovs_ct_lookup(net, key, info, skb); return 0; } @@ -442,21 +822,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - u8 state; int err; - state = key->ct.state; - if (key->ct.zone == info->zone.id && - ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { - /* Previous lookup has shown that this connection is already - * tracked and committed. Skip committing. - */ - return 0; - } - err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; + /* This is a no-op if the connection has already been confirmed. */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; @@ -541,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return 0; } +#ifdef CONFIG_NF_NAT_NEEDED +static int parse_nat(const struct nlattr *attr, + struct ovs_conntrack_info *info, bool log) +{ + struct nlattr *a; + int rem; + bool have_ip_max = false; + bool have_proto_max = false; + bool ip_vers = (info->family == NFPROTO_IPV6); + + nla_for_each_nested(a, attr, rem) { + static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { + [OVS_NAT_ATTR_SRC] = {0, 0}, + [OVS_NAT_ATTR_DST] = {0, 0}, + [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, + [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, + [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, + }; + int type = nla_type(a); + + if (type > OVS_NAT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown NAT attribute (type=%d, max=%d).\n", + type, 
OVS_NAT_ATTR_MAX); + return -EINVAL; + } + + if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { + OVS_NLERR(log, + "NAT attribute type %d has unexpected length (%d != %d).\n", + type, nla_len(a), + ovs_nat_attr_lens[type][ip_vers]); + return -EINVAL; + } + + switch (type) { + case OVS_NAT_ATTR_SRC: + case OVS_NAT_ATTR_DST: + if (info->nat) { + OVS_NLERR(log, + "Only one type of NAT may be specified.\n" + ); + return -ERANGE; + } + info->nat |= OVS_CT_NAT; + info->nat |= ((type == OVS_NAT_ATTR_SRC) + ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); + break; + + case OVS_NAT_ATTR_IP_MIN: + nla_memcpy(&info->range.min_addr, a, nla_len(a)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_IP_MAX: + have_ip_max = true; + nla_memcpy(&info->range.max_addr, a, + sizeof(info->range.max_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_PROTO_MIN: + info->range.min_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PROTO_MAX: + have_proto_max = true; + info->range.max_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PERSISTENT: + info->range.flags |= NF_NAT_RANGE_PERSISTENT; + break; + + case OVS_NAT_ATTR_PROTO_HASH: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; + break; + + case OVS_NAT_ATTR_PROTO_RANDOM: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; + break; + + default: + OVS_NLERR(log, "Unknown nat attribute (%d).\n", type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem); + return -EINVAL; + } + if (!info->nat) { + /* Do not allow flags if no type is given. */ + if (info->range.flags) { + OVS_NLERR(log, + "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n" + ); + return -EINVAL; + } + info->nat = OVS_CT_NAT; /* NAT existing connections. 
*/ + } else if (!info->commit) { + OVS_NLERR(log, + "NAT attributes may be specified only when CT COMMIT flag is also specified.\n" + ); + return -EINVAL; + } + /* Allow missing IP_MAX. */ + if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { + memcpy(&info->range.max_addr, &info->range.min_addr, + sizeof(info->range.max_addr)); + } + /* Allow missing PROTO_MAX. */ + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + !have_proto_max) { + info->range.max_proto.all = info->range.min_proto.all; + } + return 0; +} +#endif + static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), @@ -550,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, - .maxlen = NF_CT_HELPER_NAME_LEN } + .maxlen = NF_CT_HELPER_NAME_LEN }, +#ifdef CONFIG_NF_NAT_NEEDED + /* NAT length is checked when parsing the nested attributes. 
*/ + [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, +#endif }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -617,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return -EINVAL; } break; +#ifdef CONFIG_NF_NAT_NEEDED + case OVS_CT_ATTR_NAT: { + int err = parse_nat(a, info, log); + + if (err) + return err; + break; + } +#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -704,6 +1217,74 @@ err_free_ct: return err; } +#ifdef CONFIG_NF_NAT_NEEDED +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_CT_ATTR_NAT); + if (!start) + return false; + + if (info->nat & OVS_CT_SRC_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) + return false; + } else if (info->nat & OVS_CT_DST_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) + return false; + } else { + goto out; + } + + if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { + if (info->family == NFPROTO_IPV4) { + if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, + info->range.min_addr.ip) || + (info->range.max_addr.ip + != info->range.min_addr.ip && + (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, + info->range.max_addr.ip)))) + return false; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) + } else if (info->family == NFPROTO_IPV6) { + if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, + &info->range.min_addr.in6) || + (memcmp(&info->range.max_addr.in6, + &info->range.min_addr.in6, + sizeof(info->range.max_addr.in6)) && + (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, + &info->range.max_addr.in6)))) + return false; +#endif + } else { + return false; + } + } + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, + ntohs(info->range.min_proto.all)) || + (info->range.max_proto.all != info->range.min_proto.all && + nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, + ntohs(info->range.max_proto.all))))) + 
return false; + + if (info->range.flags & NF_NAT_RANGE_PERSISTENT && + nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) + return false; +out: + nla_nest_end(skb, start); + + return true; +} +#endif + int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, struct sk_buff *skb) { @@ -732,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, ct_info->helper->name)) return -EMSGSIZE; } - +#ifdef CONFIG_NF_NAT_NEEDED + if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) + return -EMSGSIZE; +#endif nla_nest_end(skb, start); return 0; diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index a7544f405c16..8f6230bd6183 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ - OVS_CS_F_INVALID | OVS_CS_F_TRACKED) + OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \ + OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) #else #include <linux/errno.h> |