author	Eric Dumazet <dada1@cosmosbay.com>	2009-03-25 21:05:46 +0100
committer	Patrick McHardy <kaber@trash.net>	2009-03-25 21:05:46 +0100
commit	ea781f197d6a835cbb93a0bf88ee1696296ed8aa (patch)
tree	820fe7df1199d8bb6c793e664e480ea56ecf612e /net/ipv4
parent	1f9352ae2253a97b07b34dcf16ffa3b4ca12c558 (diff)
netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu()
Use "hlist_nulls" infrastructure we added in 2.6.29 for RCUification of UDP & TCP. This permits an easy conversion from call_rcu() based hash lists to a SLAB_DESTROY_BY_RCU one. Avoiding call_rcu() delay at nf_conn freeing time has numerous gains. First, it doesnt fill RCU queues (up to 10000 elements per cpu). This reduces OOM possibility, if queued elements are not taken into account This reduces latency problems when RCU queue size hits hilimit and triggers emergency mode. - It allows fast reuse of just freed elements, permitting better use of CPU cache. - We delete rcu_head from "struct nf_conn", shrinking size of this structure by 8 or 16 bytes. This patch only takes care of "struct nf_conn". call_rcu() is still used for less critical conntrack parts, that may be converted later if necessary. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c	63
-rw-r--r--	net/ipv4/netfilter/nf_nat_core.c	2
2 files changed, 37 insertions(+), 28 deletions(-)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 6ba5c557690c..8668a3defda6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -25,40 +25,42 @@ struct ct_iter_state {
unsigned int bucket;
};
-static struct hlist_node *ct_get_first(struct seq_file *seq)
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- struct hlist_node *n;
+ struct hlist_nulls_node *n;
for (st->bucket = 0;
st->bucket < nf_conntrack_htable_size;
st->bucket++) {
n = rcu_dereference(net->ct.hash[st->bucket].first);
- if (n)
+ if (!is_a_nulls(n))
return n;
}
return NULL;
}
-static struct hlist_node *ct_get_next(struct seq_file *seq,
- struct hlist_node *head)
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+ struct hlist_nulls_node *head)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(head->next);
- while (head == NULL) {
- if (++st->bucket >= nf_conntrack_htable_size)
- return NULL;
+ while (is_a_nulls(head)) {
+ if (likely(get_nulls_value(head) == st->bucket)) {
+ if (++st->bucket >= nf_conntrack_htable_size)
+ return NULL;
+ }
head = rcu_dereference(net->ct.hash[st->bucket].first);
}
return head;
}
-static struct hlist_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
{
- struct hlist_node *head = ct_get_first(seq);
+ struct hlist_nulls_node *head = ct_get_first(seq);
if (head)
while (pos && (head = ct_get_next(seq, head)))
	pos--;
return pos ? NULL : head;
}
@@ -87,69 +89,76 @@ static void ct_seq_stop(struct seq_file *s, void *v)
static int ct_seq_show(struct seq_file *s, void *v)
{
- const struct nf_conntrack_tuple_hash *hash = v;
- const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+ struct nf_conntrack_tuple_hash *hash = v;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
NF_CT_ASSERT(ct);
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
+
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
- return 0;
+ goto release;
if (nf_ct_l3num(ct) != AF_INET)
- return 0;
+ goto release;
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
NF_CT_ASSERT(l3proto);
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
NF_CT_ASSERT(l4proto);
+ ret = -ENOSPC;
if (seq_printf(s, "%-8s %u %ld ",
l4proto->name, nf_ct_protonum(ct),
timer_pending(&ct->timeout)
? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
- return -ENOSPC;
+ goto release;
if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
- return -ENOSPC;
+ goto release;
if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
l3proto, l4proto))
- return -ENOSPC;
+ goto release;
if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
- return -ENOSPC;
+ goto release;
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
if (seq_printf(s, "[UNREPLIED] "))
- return -ENOSPC;
+ goto release;
if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
l3proto, l4proto))
- return -ENOSPC;
+ goto release;
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- return -ENOSPC;
+ goto release;
if (test_bit(IPS_ASSURED_BIT, &ct->status))
if (seq_printf(s, "[ASSURED] "))
- return -ENOSPC;
+ goto release;
#ifdef CONFIG_NF_CONNTRACK_MARK
if (seq_printf(s, "mark=%u ", ct->mark))
- return -ENOSPC;
+ goto release;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
if (seq_printf(s, "secmark=%u ", ct->secmark))
- return -ENOSPC;
+ goto release;
#endif
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
- return -ENOSPC;
-
- return 0;
+ goto release;
+ ret = 0;
+release:
+ nf_ct_put(ct);
+ return ret;
}
static const struct seq_operations ct_seq_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index a65cf692359f..fe65187810f0 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -679,7 +679,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
static int __net_init nf_nat_net_init(struct net *net)
{
net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
- &net->ipv4.nat_vmalloced);
+ &net->ipv4.nat_vmalloced, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
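Not visible in this hunk, and stated here as an assumption inferred from the rest of the patch: nf_ct_alloc_hashtable() gains a third argument saying whether the buckets should be set up as hlist_nulls heads, with each bucket's index encoded in its nulls value; the NAT bysource table passes 0 because it still uses plain call_rcu()-protected hlists. A sketch of what that flag would enable, with init_nulls_buckets() as a hypothetical helper:

#include <linux/list_nulls.h>

/* Hypothetical helper: give every bucket a nulls marker encoding its
 * index, so a reader falling off a chain can tell where it ended up. */
static void init_nulls_buckets(struct hlist_nulls_head *hash,
			       unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		INIT_HLIST_NULLS_HEAD(&hash[i], i);
}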