diff options
author | David S. Miller <davem@davemloft.net> | 2018-12-18 08:49:48 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-12-18 08:49:48 -0800 |
commit | 77c7a7b3e7da2c97c182c7e57a16d16c6b555c24 (patch) | |
tree | 6bde171b004d5cee63dd161dc9a66e16a48f7eae | |
parent | b12c97d45cd061862b79fe02aa41a73d685ec7a1 (diff) | |
parent | 77990464bb39eb0f5cd41e4f9e3d6411f2883cac (diff) |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next
Steffen Klassert says:
====================
pull request (net-next): ipsec-next 2018-12-18
1) Add xfrm policy selftest scripts.
From Florian Westphal.
2) Split inexact policies into four different search list
classes and use the rbtree infrastructure to store/lookup
the policies. This is to improve the policy lookup
performance after the flowcache removal.
Patches from Florian Westphal.
3) Various coding style fixes, from Colin Ian King.
4) Fix policy lookup logic after adding the inexact policy
search tree infrastructure. From Florian Westphal.
5) Remove a useless remove BUG_ON from xfrm6_dst_ifdown.
From Li RongQing.
6) Use the correct policy direction for lookups on hash
rebuilding. From Florian Westphal.
Please pull or let me know if there are problems.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/netns/xfrm.h | 2 | ||||
-rw-r--r-- | include/net/xfrm.h | 5 | ||||
-rw-r--r-- | net/ipv6/xfrm6_policy.c | 1 | ||||
-rw-r--r-- | net/key/af_key.c | 2 | ||||
-rw-r--r-- | net/xfrm/xfrm_policy.c | 1248 | ||||
-rw-r--r-- | tools/testing/selftests/net/Makefile | 3 | ||||
-rwxr-xr-x | tools/testing/selftests/net/xfrm_policy.sh | 302 |
7 files changed, 1428 insertions, 135 deletions
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 9991e5ef52cc..59f45b1e9dac 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/rhashtable-types.h> #include <linux/xfrm.h> #include <net/dst_ops.h> @@ -53,6 +54,7 @@ struct netns_xfrm { unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; struct xfrm_policy_hthresh policy_hthresh; + struct list_head inexact_bins; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0eb390c205af..0a8d70d16918 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -577,6 +577,7 @@ struct xfrm_policy { /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; + u32 pos; struct timer_list timer; atomic_t genid; @@ -589,6 +590,7 @@ struct xfrm_policy { struct xfrm_lifetime_cur curlft; struct xfrm_policy_walk_entry walk; struct xfrm_policy_queue polq; + bool bydst_reinsert; u8 type; u8 action; u8 flags; @@ -596,6 +598,7 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; + struct hlist_node bydst_inexact_list; struct rcu_head rcu; }; @@ -1967,7 +1970,7 @@ static inline void xfrm_dev_state_delete(struct xfrm_state *x) static inline void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; - struct net_device *dev = xso->dev; + struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { if (dev->xfrmdev_ops->xdo_dev_state_free) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index d35bcf92969c..769f8f78d3b8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); - BUG_ON(!loopback_idev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); diff --git a/net/key/af_key.c b/net/key/af_key.c index 9d61266526e7..655c787f9d54 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2020,7 +2020,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { - struct xfrm_sec_ctx *xfrm_ctx = xp->security; + struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 119a427d9b2b..be04091eb7db 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -26,6 +26,7 @@ #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> +#include <linux/rhashtable.h> #include <net/dst.h> #include <net/flow.h> #include <net/xfrm.h> @@ -45,6 +46,99 @@ struct xfrm_flo { u8 flags; }; +/* prefixes smaller than this are stored in lists, not trees. */ +#define INEXACT_PREFIXLEN_IPV4 16 +#define INEXACT_PREFIXLEN_IPV6 48 + +struct xfrm_pol_inexact_node { + struct rb_node node; + union { + xfrm_address_t addr; + struct rcu_head rcu; + }; + u8 prefixlen; + + struct rb_root root; + + /* the policies matching this node, can be empty list */ + struct hlist_head hhead; +}; + +/* xfrm inexact policy search tree: + * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); + * | + * +---- root_d: sorted by daddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | +- root: sorted by saddr/prefix + * | | | + * | | xfrm_pol_inexact_node + * | | | + * | | + root: unused + * | | | + * | | + hhead: saddr:daddr policies + * | | + * | +- coarse policies and all any:daddr policies + * | + * +---- root_s: sorted by saddr:prefix + * | | + * | xfrm_pol_inexact_node + * | | + * | + root: unused + * | | + * | + hhead: saddr:any policies + * | + * +---- coarse policies and all any:any policies + * + * Lookups return four candidate lists: + * 1. any:any list from top-level xfrm_pol_inexact_bin + * 2. any:daddr list from daddr tree + * 3. saddr:daddr list from 2nd level daddr tree + * 4. saddr:any list from saddr tree + * + * This result set then needs to be searched for the policy with + * the lowest priority. If two results have same prio, youngest one wins. + */ + +struct xfrm_pol_inexact_key { + possible_net_t net; + u32 if_id; + u16 family; + u8 dir, type; +}; + +struct xfrm_pol_inexact_bin { + struct xfrm_pol_inexact_key k; + struct rhash_head head; + /* list containing '*:*' policies */ + struct hlist_head hhead; + + seqcount_t count; + /* tree sorted by daddr/prefix */ + struct rb_root root_d; + + /* tree sorted by saddr/prefix */ + struct rb_root root_s; + + /* slow path below */ + struct list_head inexact_bins; + struct rcu_head rcu; +}; + +enum xfrm_pol_inexact_candidate_type { + XFRM_POL_CAND_BOTH, + XFRM_POL_CAND_SADDR, + XFRM_POL_CAND_DADDR, + XFRM_POL_CAND_ANY, + + XFRM_POL_CAND_MAX, +}; + +struct xfrm_pol_inexact_candidates { + struct hlist_head *res[XFRM_POL_CAND_MAX]; +}; + static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -55,6 +149,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; +static struct rhashtable xfrm_policy_inexact_table; +static const struct rhashtable_params xfrm_pol_inexact_params; + static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); @@ -64,6 +161,25 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, + u32 if_id); + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, + u8 type, u16 family, u8 dir, u32 if_id); +static struct xfrm_policy * +xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, + bool excl); +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy); + +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr); + static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); @@ -269,6 +385,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -365,7 +482,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net, hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) - return &net->xfrm.policy_inexact[dir]; + return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; @@ -563,6 +680,533 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_reset_inexact_table(struct net *net) +{ + struct xfrm_pol_inexact_bin *b; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) + INIT_HLIST_HEAD(&b->hhead); +} + +/* Make sure *pol can be inserted into fastbin. + * Useful to check that later insert requests will be sucessful + * (provided xfrm_policy_lock is held throughout). + */ +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) +{ + struct xfrm_pol_inexact_bin *bin, *prev; + struct xfrm_pol_inexact_key k = { + .family = pol->family, + .type = pol->type, + .dir = dir, + .if_id = pol->if_id, + }; + struct net *net = xp_net(pol); + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + write_pnet(&k.net, net); + bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); + if (bin) + return bin; + + bin = kzalloc(sizeof(*bin), GFP_ATOMIC); + if (!bin) + return NULL; + + bin->k = k; + INIT_HLIST_HEAD(&bin->hhead); + bin->root_d = RB_ROOT; + bin->root_s = RB_ROOT; + seqcount_init(&bin->count); + + prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, + &bin->k, &bin->head, + xfrm_pol_inexact_params); + if (!prev) { + list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); + return bin; + } + + kfree(bin); + + return IS_ERR(prev) ? NULL : prev; +} + +static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, + int family, u8 prefixlen) +{ + if (xfrm_addr_any(addr, family)) + return true; + + if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) + return true; + + if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) + return true; + + return false; +} + +static bool +xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) +{ + const xfrm_address_t *addr; + bool saddr_any, daddr_any; + u8 prefixlen; + + addr = &policy->selector.saddr; + prefixlen = policy->selector.prefixlen_s; + + saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + addr = &policy->selector.daddr; + prefixlen = policy->selector.prefixlen_d; + daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, + policy->family, + prefixlen); + return saddr_any && daddr_any; +} + +static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, + const xfrm_address_t *addr, u8 prefixlen) +{ + node->addr = *addr; + node->prefixlen = prefixlen; +} + +static struct xfrm_pol_inexact_node * +xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) +{ + struct xfrm_pol_inexact_node *node; + + node = kzalloc(sizeof(*node), GFP_ATOMIC); + if (node) + xfrm_pol_inexact_node_init(node, addr, prefixlen); + + return node; +} + +static int xfrm_policy_addr_delta(const xfrm_address_t *a, + const xfrm_address_t *b, + u8 prefixlen, u16 family) +{ + unsigned int pdw, pbi; + int delta = 0; + + switch (family) { + case AF_INET: + if (sizeof(long) == 4 && prefixlen == 0) + return ntohl(a->a4) - ntohl(b->a4); + return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) - + (ntohl(b->a4) & ((~0UL << (32 - prefixlen)))); + case AF_INET6: + pdw = prefixlen >> 5; + pbi = prefixlen & 0x1f; + + if (pdw) { + delta = memcmp(a->a6, b->a6, pdw << 2); + if (delta) + return delta; + } + if (pbi) { + u32 mask = ~0u << (32 - pbi); + + delta = (ntohl(a->a6[pdw]) & mask) - + (ntohl(b->a6[pdw]) & mask); + } + break; + default: + break; + } + + return delta; +} + +static void xfrm_policy_inexact_list_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + unsigned int matched_s, matched_d; + struct hlist_node *newpos = NULL; + struct xfrm_policy *policy, *p; + + matched_s = 0; + matched_d = 0; + + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + bool matches_s, matches_d; + + if (!policy->bydst_reinsert) + continue; + + WARN_ON_ONCE(policy->family != family); + + policy->bydst_reinsert = false; + hlist_for_each_entry(p, &n->hhead, bydst) { + if (policy->priority >= p->priority) + newpos = &p->bydst; + else + break; + } + + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, &n->hhead); + + /* paranoia checks follow. + * Check that the reinserted policy matches at least + * saddr or daddr for current node prefix. + * + * Matching both is fine, matching saddr in one policy + * (but not daddr) and then matching only daddr in another + * is a bug. + */ + matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, + &n->addr, + n->prefixlen, + family) == 0; + matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, + &n->addr, + n->prefixlen, + family) == 0; + if (matches_s && matches_d) + continue; + + WARN_ON_ONCE(!matches_s && !matches_d); + if (matches_s) + matched_s++; + if (matches_d) + matched_d++; + WARN_ON_ONCE(matched_s && matched_d); + } +} + +static void xfrm_policy_inexact_node_reinsert(struct net *net, + struct xfrm_pol_inexact_node *n, + struct rb_root *new, + u16 family) +{ + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + /* we should not have another subtree here */ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); + + p = &new->rb_node; + while (*p) { + u8 prefixlen; + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + prefixlen = min(node->prefixlen, n->prefixlen); + + delta = xfrm_policy_addr_delta(&n->addr, &node->addr, + prefixlen, family); + if (delta < 0) { + p = &parent->rb_left; + } else if (delta > 0) { + p = &parent->rb_right; + } else { + struct xfrm_policy *tmp; + + hlist_for_each_entry(tmp, &node->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&node->hhead); + xfrm_policy_inexact_list_reinsert(net, node, family); + + if (node->prefixlen == n->prefixlen) { + kfree_rcu(n, rcu); + return; + } + + rb_erase(*p, new); + kfree_rcu(n, rcu); + n = node; + n->prefixlen = prefixlen; + *p = new->rb_node; + parent = NULL; + } + } + + rb_link_node_rcu(&n->node, parent, p); + rb_insert_color(&n->node, new); +} + +/* merge nodes v and n */ +static void xfrm_policy_inexact_node_merge(struct net *net, + struct xfrm_pol_inexact_node *v, + struct xfrm_pol_inexact_node *n, + u16 family) +{ + struct xfrm_pol_inexact_node *node; + struct xfrm_policy *tmp; + struct rb_node *rnode; + + /* To-be-merged node v has a subtree. + * + * Dismantle it and insert its nodes to n->root. + */ + while ((rnode = rb_first(&v->root)) != NULL) { + node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); + rb_erase(&node->node, &v->root); + xfrm_policy_inexact_node_reinsert(net, node, &n->root, + family); + } + + hlist_for_each_entry(tmp, &v->hhead, bydst) + tmp->bydst_reinsert = true; + hlist_for_each_entry(tmp, &n->hhead, bydst) + tmp->bydst_reinsert = true; + + INIT_HLIST_HEAD(&n->hhead); + xfrm_policy_inexact_list_reinsert(net, n, family); +} + +static struct xfrm_pol_inexact_node * +xfrm_policy_inexact_insert_node(struct net *net, + struct rb_root *root, + xfrm_address_t *addr, + u16 family, u8 prefixlen, u8 dir) +{ + struct xfrm_pol_inexact_node *cached = NULL; + struct rb_node **p, *parent = NULL; + struct xfrm_pol_inexact_node *node; + + p = &root->rb_node; + while (*p) { + int delta; + + parent = *p; + node = rb_entry(*p, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, + family); + if (delta == 0 && prefixlen >= node->prefixlen) { + WARN_ON_ONCE(cached); /* ipsec policies got lost */ + return node; + } + + if (delta < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + if (prefixlen < node->prefixlen) { + delta = xfrm_policy_addr_delta(addr, &node->addr, + prefixlen, + family); + if (delta) + continue; + + /* This node is a subnet of the new prefix. It needs + * to be removed and re-inserted with the smaller + * prefix and all nodes that are now also covered + * by the reduced prefixlen. + */ + rb_erase(&node->node, root); + + if (!cached) { + xfrm_pol_inexact_node_init(node, addr, + prefixlen); + cached = node; + } else { + /* This node also falls within the new + * prefixlen. Merge the to-be-reinserted + * node and this one. + */ + xfrm_policy_inexact_node_merge(net, node, + cached, family); + kfree_rcu(node, rcu); + } + + /* restart */ + p = &root->rb_node; + parent = NULL; + } + } + + node = cached; + if (!node) { + node = xfrm_pol_inexact_node_alloc(addr, prefixlen); + if (!node) + return NULL; + } + + rb_link_node_rcu(&node->node, parent, p); + rb_insert_color(&node->node, root); + + return node; +} + +static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) +{ + struct xfrm_pol_inexact_node *node; + struct rb_node *rn = rb_first(r); + + while (rn) { + node = rb_entry(rn, struct xfrm_pol_inexact_node, node); + + xfrm_policy_inexact_gc_tree(&node->root, rm); + rn = rb_next(rn); + + if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { + WARN_ON_ONCE(rm); + continue; + } + + rb_erase(&node->node, r); + kfree_rcu(node, rcu); + } +} + +static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) +{ + write_seqcount_begin(&b->count); + xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); + xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); + write_seqcount_end(&b->count); + + if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || + !hlist_empty(&b->hhead)) { + WARN_ON_ONCE(net_exit); + return; + } + + if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, + xfrm_pol_inexact_params) == 0) { + list_del(&b->inexact_bins); + kfree_rcu(b, rcu); + } +} + +static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) +{ + struct net *net = read_pnet(&b->k.net); + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + __xfrm_policy_inexact_prune_bin(b, false); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); +} + +static void __xfrm_policy_inexact_flush(struct net *net) +{ + struct xfrm_pol_inexact_bin *bin, *t; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(bin, false); +} + +static struct hlist_head * +xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, + struct xfrm_policy *policy, u8 dir) +{ + struct xfrm_pol_inexact_node *n; + struct net *net; + + net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + if (xfrm_policy_inexact_insert_use_any_list(policy)) + return &bin->hhead; + + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d)) { + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_s, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, + dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + return &n->hhead; + } + + /* daddr is fixed */ + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &bin->root_d, + &policy->selector.daddr, + policy->family, + policy->selector.prefixlen_d, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + /* saddr is wildcard */ + if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s)) + return &n->hhead; + + write_seqcount_begin(&bin->count); + n = xfrm_policy_inexact_insert_node(net, + &n->root, + &policy->selector.saddr, + policy->family, + policy->selector.prefixlen_s, dir); + write_seqcount_end(&bin->count); + if (!n) + return NULL; + + return &n->hhead; +} + +static struct xfrm_policy * +xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) +{ + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *delpol; + struct hlist_head *chain; + struct net *net; + + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) + return ERR_PTR(-ENOMEM); + + net = xp_net(policy); + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); + if (!chain) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-ENOMEM); + } + + delpol = xfrm_policy_insert_list(chain, policy, excl); + if (delpol && excl) { + __xfrm_policy_inexact_prune_bin(bin, false); + return ERR_PTR(-EEXIST); + } + + chain = &net->xfrm.policy_inexact[dir]; + xfrm_policy_insert_inexact_list(chain, policy); + + if (delpol) + __xfrm_policy_inexact_prune_bin(bin, false); + + return delpol; +} + static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, @@ -592,7 +1236,50 @@ static void xfrm_hash_rebuild(struct work_struct *work) spin_lock_bh(&net->xfrm.xfrm_policy_lock); + /* make sure that we can insert the indirect policies again before + * we start with destructive action. + */ + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + struct xfrm_pol_inexact_bin *bin; + u8 dbits, sbits; + + dir = xfrm_policy_id2dir(policy->index); + if (policy->walk.dead || dir >= XFRM_POLICY_MAX) + continue; + + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + if (policy->family == AF_INET) { + dbits = rbits4; + sbits = lbits4; + } else { + dbits = rbits6; + sbits = lbits6; + } + } else { + if (policy->family == AF_INET) { + dbits = lbits4; + sbits = rbits4; + } else { + dbits = lbits6; + sbits = rbits6; + } + } + + if (policy->selector.prefixlen_d < dbits || + policy->selector.prefixlen_s < sbits) + continue; + + bin = xfrm_policy_inexact_alloc_bin(policy, dir); + if (!bin) + goto out_unlock; + + if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) + goto out_unlock; + } + /* reset the bydst and inexact table in all directions */ + xfrm_hash_reset_inexact_table(net); + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); hmask = net->xfrm.policy_bydst[dir].hmask; @@ -616,15 +1303,23 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (policy->walk.dead || - xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { + if (policy->walk.dead) + continue; + dir = xfrm_policy_id2dir(policy->index); + if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, - policy->family, - xfrm_policy_id2dir(policy->index)); + policy->family, dir); + if (!chain) { + void *p = xfrm_policy_inexact_insert(policy, dir, 0); + + WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); + continue; + } + hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; @@ -637,6 +1332,8 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head_rcu(&policy->bydst, chain); } +out_unlock: + __xfrm_policy_inexact_flush(net); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -740,18 +1437,97 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy, return false; } -int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { - struct net *net = xp_net(policy); - struct xfrm_policy *pol; - struct xfrm_policy *delpol; - struct hlist_head *chain; - struct hlist_node *newpos; + const struct xfrm_pol_inexact_key *k = data; + u32 a = k->type << 24 | k->dir << 16 | k->family; + + return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), + seed); +} + +static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) +{ + const struct xfrm_pol_inexact_bin *b = data; + + return xfrm_pol_bin_key(&b->k, 0, seed); +} + +static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xfrm_pol_inexact_key *key = arg->key; + const struct xfrm_pol_inexact_bin *b = ptr; + int ret; + + if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) + return -1; + + ret = b->k.dir ^ key->dir; + if (ret) + return ret; + + ret = b->k.type ^ key->type; + if (ret) + return ret; + + ret = b->k.family ^ key->family; + if (ret) + return ret; + + return b->k.if_id ^ key->if_id; +} + +static const struct rhashtable_params xfrm_pol_inexact_params = { + .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), + .hashfn = xfrm_pol_bin_key, + .obj_hashfn = xfrm_pol_bin_obj, + .obj_cmpfn = xfrm_pol_bin_cmp, + .automatic_shrinking = true, +}; + +static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, + struct xfrm_policy *policy) +{ + struct xfrm_policy *pol, *delpol = NULL; + struct hlist_node *newpos = NULL; + int i = 0; + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + if (pol->type == policy->type && + pol->if_id == policy->if_id && + !selector_cmp(&pol->selector, &policy->selector) && + xfrm_policy_mark_match(policy, pol) && + xfrm_sec_ctx_match(pol->security, policy->security) && + !WARN_ON(delpol)) { + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) { + newpos = &pol->bydst_inexact_list; + continue; + } + if (delpol) + break; + } + + if (newpos) + hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); + else + hlist_add_head_rcu(&policy->bydst_inexact_list, chain); + + hlist_for_each_entry(pol, chain, bydst_inexact_list) { + pol->pos = i; + i++; + } +} + +static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, + struct xfrm_policy *policy, + bool excl) +{ + struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); - delpol = NULL; - newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && @@ -759,24 +1535,45 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { - if (excl) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return -EEXIST; - } + if (excl) + return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { - newpos = &pol->bydst; + newpos = pol; continue; } if (delpol) break; } + if (newpos) - hlist_add_behind_rcu(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else hlist_add_head_rcu(&policy->bydst, chain); + + return delpol; +} + +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct net *net = xp_net(policy); + struct xfrm_policy *delpol; + struct hlist_head *chain; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + if (chain) + delpol = xfrm_policy_insert_list(chain, policy, excl); + else + delpol = xfrm_policy_inexact_insert(policy, dir, excl); + + if (IS_ERR(delpol)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return PTR_ERR(delpol); + } + __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ @@ -806,43 +1603,96 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); +static struct xfrm_policy * +__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx) +{ + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry(pol, chain, bydst) { + if (pol->type == type && + pol->if_id == if_id && + (mark & pol->mark.m) == pol->mark.v && + !selector_cmp(sel, &pol->selector) && + xfrm_sec_ctx_match(ctx, pol->security)) + return pol; + } + + return NULL; +} + struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { - struct xfrm_policy *pol, *ret; + struct xfrm_pol_inexact_bin *bin = NULL; + struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); - ret = NULL; - hlist_for_each_entry(pol, chain, bydst) { - if (pol->type == type && - pol->if_id == if_id && - (mark & pol->mark.m) == pol->mark.v && - !selector_cmp(sel, &pol->selector) && - xfrm_sec_ctx_match(ctx, pol->security)) { - xfrm_pol_hold(pol); - if (delete) { - *err = security_xfrm_policy_delete( - pol->security); - if (*err) { - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - return pol; - } - __xfrm_policy_unlink(pol, dir); + if (!chain) { + struct xfrm_pol_inexact_candidates cand; + int i; + + bin = xfrm_policy_inexact_lookup(net, type, + sel->family, dir, if_id); + if (!bin) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + if (!xfrm_policy_find_inexact_candidates(&cand, bin, + &sel->saddr, + &sel->daddr)) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return NULL; + } + + pol = NULL; + for (i = 0; i < ARRAY_SIZE(cand.res); i++) { + struct xfrm_policy *tmp; + + tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, + if_id, type, dir, + sel, ctx); + if (!tmp) + continue; + + if (!pol || tmp->pos < pol->pos) + pol = tmp; + } + } else { + pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, + sel, ctx); + } + + if (pol) { + xfrm_pol_hold(pol); + if (delete) { + *err = security_xfrm_policy_delete(pol->security); + if (*err) { + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return pol; } - ret = pol; - break; + __xfrm_policy_unlink(pol, dir); } + ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); + if (bin && delete) + xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); @@ -892,36 +1742,19 @@ EXPORT_SYMBOL(xfrm_policy_byid); static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { - int dir, err = 0; + struct xfrm_policy *pol; + int err = 0; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->type != type) + continue; - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete(pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, task_valid); - return err; - } - } - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - err = security_xfrm_policy_delete( - pol->security); - if (err) { - xfrm_audit_policy_delete(pol, 0, - task_valid); - return err; - } - } + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; } } return err; @@ -937,6 +1770,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); @@ -944,48 +1778,25 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (err) goto out; - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - struct xfrm_policy *pol; - int i; - - again1: - hlist_for_each_entry(pol, - &net->xfrm.policy_inexact[dir], bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again1; - } - - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - again2: - hlist_for_each_entry(pol, - net->xfrm.policy_bydst[dir].table + i, - bydst) { - if (pol->type != type) - continue; - __xfrm_policy_unlink(pol, dir); - spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - cnt++; - - xfrm_audit_policy_delete(pol, 1, task_valid); - xfrm_policy_kill(pol); - - spin_lock_bh(&net->xfrm.xfrm_policy_lock); - goto again2; - } - } +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->type != type) + continue; + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; } - if (!cnt) + if (cnt) + __xfrm_policy_inexact_flush(net); + else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1084,21 +1895,189 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); - return ret; } +static struct xfrm_pol_inexact_node * +xfrm_policy_lookup_inexact_addr(const struct rb_root *r, + seqcount_t *count, + const xfrm_address_t *addr, u16 family) +{ + const struct rb_node *parent; + int seq; + +again: + seq = read_seqcount_begin(count); + + parent = rcu_dereference_raw(r->rb_node); + while (parent) { + struct xfrm_pol_inexact_node *node; + int delta; + + node = rb_entry(parent, struct xfrm_pol_inexact_node, node); + + delta = xfrm_policy_addr_delta(addr, &node->addr, + node->prefixlen, family); + if (delta < 0) { + parent = rcu_dereference_raw(parent->rb_left); + continue; + } else if (delta > 0) { + parent = rcu_dereference_raw(parent->rb_right); + continue; + } + + return node; + } + + if (read_seqcount_retry(count, seq)) + goto again; + + return NULL; +} + +static bool +xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_pol_inexact_bin *b, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct xfrm_pol_inexact_node *n; + u16 family; + + if (!b) + return false; + + family = b->k.family; + memset(cand, 0, sizeof(*cand)); + cand->res[XFRM_POL_CAND_ANY] = &b->hhead; + + n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, + family); + if (n) { + cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; + n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; + } + + n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, + family); + if (n) + cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; + + return true; +} + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) +{ + struct xfrm_pol_inexact_key k = { + .family = family, + .type = type, + .dir = dir, + .if_id = if_id, + }; + + write_pnet(&k.net, net); + + return rhashtable_lookup(&xfrm_policy_inexact_table, &k, + xfrm_pol_inexact_params); +} + +static struct xfrm_pol_inexact_bin * +xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, + u8 dir, u32 if_id) +{ + struct xfrm_pol_inexact_bin *bin; + + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); + + rcu_read_lock(); + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); + rcu_read_unlock(); + + return bin; +} + +static struct xfrm_policy * +__xfrm_policy_eval_candidates(struct hlist_head *chain, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + u32 priority = prefer ? prefer->priority : ~0u; + struct xfrm_policy *pol; + + if (!chain) + return NULL; + + hlist_for_each_entry_rcu(pol, chain, bydst) { + int err; + + if (pol->priority > priority) + break; + + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + if (err) { + if (err != -ESRCH) + return ERR_PTR(err); + + continue; + } + + if (prefer) { + /* matches. Is it older than *prefer? */ + if (pol->priority == priority && + prefer->pos < pol->pos) + return prefer; + } + + return pol; + } + + return NULL; +} + +static struct xfrm_policy * +xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, + struct xfrm_policy *prefer, + const struct flowi *fl, + u8 type, u16 family, int dir, u32 if_id) +{ + struct xfrm_policy *tmp; + int i; + + for (i = 0; i < ARRAY_SIZE(cand->res); i++) { + tmp = __xfrm_policy_eval_candidates(cand->res[i], + prefer, + fl, type, family, dir, + if_id); + if (!tmp) + continue; + + if (IS_ERR(tmp)) + return tmp; + prefer = tmp; + } + + return prefer; +} + static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { - int err; - struct xfrm_policy *pol, *ret; + struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; + struct xfrm_pol_inexact_bin *bin; + struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; u32 priority; + int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); @@ -1129,25 +2108,20 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } - chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry_rcu(pol, chain, bydst) { - if ((pol->priority >= priority) && ret) - break; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); + if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, + daddr)) + goto skip_inexact; - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); - if (err) { - if (err == -ESRCH) - continue; - else { - ret = ERR_PTR(err); - goto fail; - } - } else { - ret = pol; - break; - } + pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, + family, dir, if_id); + if (pol) { + ret = pol; + if (IS_ERR(pol)) + goto fail; } +skip_inexact: if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) goto retry; @@ -1239,6 +2213,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); + hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } @@ -1811,7 +2786,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); - goto out; + goto out; } dst_release(dst); @@ -2816,13 +3791,17 @@ static void xfrm_statistics_fini(struct net *net) static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; - int dir; + int dir, err; - if (net_eq(net, &init_net)) + if (net_eq(net, &init_net)) { xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + err = rhashtable_init(&xfrm_policy_inexact_table, + &xfrm_pol_inexact_params); + BUG_ON(err); + } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); @@ -2857,6 +3836,7 @@ static int __net_init xfrm_policy_init(struct net *net) seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); + INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; @@ -2875,6 +3855,7 @@ out_byidx: static void xfrm_policy_fini(struct net *net) { + struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; @@ -2900,6 +3881,11 @@ static void xfrm_policy_fini(struct net *net) sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) + __xfrm_policy_inexact_prune_bin(b, true); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) @@ -3065,7 +4051,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index aeecc3ef53d0..9543a4c2f9be 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -4,7 +4,8 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../usr/include/ -TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh +TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ + rtnetlink.sh xfrm_policy.sh TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh new file mode 100755 index 000000000000..8db35b99457c --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -0,0 +1,302 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Check xfrm policy resolution. Topology: +# +# 1.2 1.1 3.1 3.10 2.1 2.2 +# eth1 eth1 veth0 veth0 eth1 eth1 +# ns1 ---- ns3 ----- ns4 ---- ns2 +# +# ns3 and ns4 are connected via ipsec tunnel. +# pings from ns1 to ns2 (and vice versa) are supposed to work like this: +# ns1: ping 10.0.2.2: passes via ipsec tunnel. +# ns2: ping 10.0.1.2: passes via ipsec tunnel. + +# ns1: ping 10.0.1.253: passes via ipsec tunnel (direct policy) +# ns2: ping 10.0.2.253: passes via ipsec tunnel (direct policy) +# +# ns1: ping 10.0.2.254: does NOT pass via ipsec tunnel (exception) +# ns2: ping 10.0.1.254: does NOT pass via ipsec tunnel (exception) + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 +policy_checks_ok=1 + +KEY_SHA=0xdeadbeef1234567890abcdefabcdefabcdefabcd +KEY_AES=0x0123456789abcdef0123456789012345 +SPI1=0x1 +SPI2=0x2 + +do_esp() { + local ns=$1 + local me=$2 + local remote=$3 + local lnet=$4 + local rnet=$5 + local spi_out=$6 + local spi_in=$7 + + ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet + ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet + + # to encrypt packets as they go out (includes forwarded packets that need encapsulation) + ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 100 action allow + # to fwd decrypted packets after esp processing: + ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow +} + +do_esp_policy_get_check() { + local ns=$1 + local lnet=$2 + local rnet=$3 + + ip -net $ns xfrm policy get src $lnet dst $rnet dir out > /dev/null + if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then + policy_checks_ok=0 + echo "FAIL: ip -net $ns xfrm policy get src $lnet dst $rnet dir out" + ret=1 + fi + + ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd > /dev/null + if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then + policy_checks_ok=0 + echo "FAIL: ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd" + ret=1 + fi +} + +do_exception() { + local ns=$1 + local me=$2 + local remote=$3 + local encryptip=$4 + local plain=$5 + + # network $plain passes without tunnel + ip -net $ns xfrm policy add dst $plain dir out priority 10 action allow + + # direct policy for $encryptip, use tunnel, higher prio takes precedence + ip -net $ns xfrm policy add dst $encryptip dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow +} + +# policies that are not supposed to match any packets generated in this test. +do_dummies4() { + local ns=$1 + + for i in $(seq 10 16);do + # dummy policy with wildcard src/dst. + echo netns exec $ns ip xfrm policy add src 0.0.0.0/0 dst 10.$i.99.0/30 dir out action block + echo netns exec $ns ip xfrm policy add src 10.$i.99.0/30 dst 0.0.0.0/0 dir out action block + for j in $(seq 32 64);do + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/30 dst 10.$i.$j.0/30 dir out action block + # silly, as it encompasses the one above too, but its allowed: + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/29 dst 10.$i.$j.0/29 dir out action block + # and yet again, even more broad one. + echo netns exec $ns ip xfrm policy add src 10.$i.1.0/24 dst 10.$i.$j.0/24 dir out action block + echo netns exec $ns ip xfrm policy add src 10.$i.$j.0/24 dst 10.$i.1.0/24 dir fwd action block + done + done | ip -batch /dev/stdin +} + +do_dummies6() { + local ns=$1 + + for i in $(seq 10 16);do + for j in $(seq 32 64);do + echo netns exec $ns ip xfrm policy add src dead:$i::/64 dst dead:$i:$j::/64 dir out action block + echo netns exec $ns ip xfrm policy add src dead:$i:$j::/64 dst dead:$i::/24 dir fwd action block + done + done | ip -batch /dev/stdin +} + +check_ipt_policy_count() +{ + ns=$1 + + ip netns exec $ns iptables-save -c |grep policy | ( read c rest + ip netns exec $ns iptables -Z + if [ x"$c" = x'[0:0]' ]; then + exit 0 + elif [ x"$c" = x ]; then + echo "ERROR: No counters" + ret=1 + exit 111 + else + exit 1 + fi + ) +} + +check_xfrm() { + # 0: iptables -m policy rule count == 0 + # 1: iptables -m policy rule count != 0 + rval=$1 + ip=$2 + lret=0 + + ip netns exec ns1 ping -q -c 1 10.0.2.$ip > /dev/null + + check_ipt_policy_count ns3 + if [ $? -ne $rval ] ; then + lret=1 + fi + check_ipt_policy_count ns4 + if [ $? -ne $rval ] ; then + lret=1 + fi + + ip netns exec ns2 ping -q -c 1 10.0.1.$ip > /dev/null + + check_ipt_policy_count ns3 + if [ $? -ne $rval ] ; then + lret=1 + fi + check_ipt_policy_count ns4 + if [ $? -ne $rval ] ; then + lret=1 + fi + + return $lret +} + +#check for needed privileges +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +ip -Version 2>/dev/null >/dev/null +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without the ip tool" + exit $ksft_skip +fi + +# needed to check if policy lookup got valid ipsec result +iptables --version 2>/dev/null >/dev/null +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables tool" + exit $ksft_skip +fi + +for i in 1 2 3 4; do + ip netns add ns$i + ip -net ns$i link set lo up +done + +DEV=veth0 +ip link add $DEV netns ns1 type veth peer name eth1 netns ns3 +ip link add $DEV netns ns2 type veth peer name eth1 netns ns4 + +ip link add $DEV netns ns3 type veth peer name veth0 netns ns4 + +DEV=veth0 +for i in 1 2; do + ip -net ns$i link set $DEV up + ip -net ns$i addr add 10.0.$i.2/24 dev $DEV + ip -net ns$i addr add dead:$i::2/64 dev $DEV + + ip -net ns$i addr add 10.0.$i.253 dev $DEV + ip -net ns$i addr add 10.0.$i.254 dev $DEV + ip -net ns$i addr add dead:$i::fd dev $DEV + ip -net ns$i addr add dead:$i::fe dev $DEV +done + +for i in 3 4; do +ip -net ns$i link set eth1 up +ip -net ns$i link set veth0 up +done + +ip -net ns1 route add default via 10.0.1.1 +ip -net ns2 route add default via 10.0.2.1 + +ip -net ns3 addr add 10.0.1.1/24 dev eth1 +ip -net ns3 addr add 10.0.3.1/24 dev veth0 +ip -net ns3 addr add 2001:1::1/64 dev eth1 +ip -net ns3 addr add 2001:3::1/64 dev veth0 + +ip -net ns3 route add default via 10.0.3.10 + +ip -net ns4 addr add 10.0.2.1/24 dev eth1 +ip -net ns4 addr add 10.0.3.10/24 dev veth0 +ip -net ns4 addr add 2001:2::1/64 dev eth1 +ip -net ns4 addr add 2001:3::10/64 dev veth0 +ip -net ns4 route add default via 10.0.3.1 + +for j in 4 6; do + for i in 3 4;do + ip netns exec ns$i sysctl net.ipv$j.conf.eth1.forwarding=1 > /dev/null + ip netns exec ns$i sysctl net.ipv$j.conf.veth0.forwarding=1 > /dev/null + done +done + +# abuse iptables rule counter to check if ping matches a policy +ip netns exec ns3 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec +ip netns exec ns4 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec +if [ $? -ne 0 ];then + echo "SKIP: Could not insert iptables rule" + for i in 1 2 3 4;do ip netns del ns$i;done + exit $ksft_skip +fi + +# localip remoteip localnet remotenet +do_esp ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2 +do_esp ns3 dead:3::1 dead:3::10 dead:1::/64 dead:2::/64 $SPI1 $SPI2 +do_esp ns4 10.0.3.10 10.0.3.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1 +do_esp ns4 dead:3::10 dead:3::1 dead:2::/64 dead:1::/64 $SPI2 $SPI1 + +do_dummies4 ns3 +do_dummies6 ns4 + +do_esp_policy_get_check ns3 10.0.1.0/24 10.0.2.0/24 +do_esp_policy_get_check ns4 10.0.2.0/24 10.0.1.0/24 +do_esp_policy_get_check ns3 dead:1::/64 dead:2::/64 +do_esp_policy_get_check ns4 dead:2::/64 dead:1::/64 + +# ping to .254 should use ipsec, exception is not installed. +check_xfrm 1 254 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .254 to use ipsec tunnel" + ret=1 +else + echo "PASS: policy before exception matches" +fi + +# installs exceptions +# localip remoteip encryptdst plaindst +do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28 +do_exception ns4 10.0.3.10 10.0.3.1 10.0.1.253 10.0.1.240/28 + +do_exception ns3 dead:3::1 dead:3::10 dead:2::fd dead:2:f0::/96 +do_exception ns4 dead:3::10 dead:3::1 dead:1::fd dead:1:f0::/96 + +# ping to .254 should now be excluded from the tunnel +check_xfrm 0 254 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .254 to fail" + ret=1 +else + echo "PASS: ping to .254 bypassed ipsec tunnel" +fi + +# ping to .253 should use use ipsec due to direct policy exception. +check_xfrm 1 253 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .253 to use ipsec tunnel" + ret=1 +else + echo "PASS: direct policy matches" +fi + +# ping to .2 should use ipsec. +check_xfrm 1 2 +if [ $? -ne 0 ]; then + echo "FAIL: expected ping to .2 to use ipsec tunnel" + ret=1 +else + echo "PASS: policy matches" +fi + +for i in 1 2 3 4;do ip netns del ns$i;done + +exit $ret |