author | Eric Dumazet <edumazet@google.com> | 2016-06-06 09:37:16 -0700
committer | David S. Miller <davem@davemloft.net> | 2016-06-07 16:37:14 -0700
commit | edb09eb17ed89eaa82a52dd306beac93e292b485
tree | 1f241506d6b781b65d1033925f1c1ce6a39c3394 /net/core
parent | f9eb8aea2a1e12fc2f584d1627deeb957435a801
net: sched: do not acquire qdisc spinlock in qdisc/class stats dump
Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by the Google BwE host
agent [1] are problematic at scale:
For each qdisc/class found in the dump, we currently lock the root qdisc
spinlock in order to get stats. Sampling stats every 5 seconds from
thousands of HTB classes is a challenge when the root qdisc spinlock is
under high pressure. Not only do the dumps take time, they also slow
down the fast path (packet enqueue/dequeue) by 10% to 20% in some cases.
An audit of existing qdiscs showed that sch_fq_codel is the only qdisc
that might need the qdisc lock in fq_codel_dump_stats() and
fq_codel_dump_class_stats().
In v2 of this patch, I now use the Qdisc running seqcount to provide
consistent reads of the packets/bytes counters, regardless of 32-bit or
64-bit architecture.
I also changed the rate estimators to use the same infrastructure,
so that they no longer need to take the root qdisc lock.
[1] http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Kevin Athey <kda@google.com>
Cc: Xiaotian Pei <xiaotian@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
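
As background for the seqcount approach described above: the write side lives in the qdisc fast path (see parent commit f9eb8aea2a1e), and the dump side retries its snapshot if a write section ran concurrently, so a consistent 64-bit read needs no spinlock even on 32-bit machines. A minimal, illustrative sketch of that pattern (placeholder names, not part of this patch):

```c
/* Illustrative sketch of the seqcount pattern used by this patch;
 * 'running' and 'struct counters' are placeholders, not kernel APIs.
 */
#include <linux/seqlock.h>

struct counters {
	u64	bytes;
	u64	packets;
};

/* Writer (fast path): counter updates happen inside a write section. */
static void counters_update(seqcount_t *running, struct counters *c,
			    u64 bytes, u64 packets)
{
	write_seqcount_begin(running);
	c->bytes += bytes;
	c->packets += packets;
	write_seqcount_end(running);
}

/* Reader (stats dump): retry the snapshot if a writer ran meanwhile. */
static void counters_snapshot(const seqcount_t *running,
			      const struct counters *c,
			      struct counters *snap)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(running);
		snap->bytes = c->bytes;
		snap->packets = c->packets;
	} while (read_seqcount_retry(running, seq));
}
```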
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/gen_estimator.c | 24
-rw-r--r-- | net/core/gen_stats.c | 34

2 files changed, 39 insertions(+), 19 deletions(-)
```diff
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 4573d81093fe..cad8e791f28e 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -84,6 +84,7 @@ struct gen_estimator
 	struct gnet_stats_basic_packed	*bstats;
 	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
+	seqcount_t		*running;
 	int			ewma_log;
 	u32			last_packets;
 	unsigned long		avpps;
@@ -121,26 +122,28 @@ static void est_timer(unsigned long arg)
 		unsigned long rate;
 		u64 brate;
 
-		spin_lock(e->stats_lock);
+		if (e->stats_lock)
+			spin_lock(e->stats_lock);
 		read_lock(&est_lock);
 		if (e->bstats == NULL)
 			goto skip;
 
-		__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
 
 		brate = (b.bytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = b.bytes;
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		e->rate_est->bps = (e->avbps+0xF)>>5;
+		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
 
 		rate = b.packets - e->last_packets;
 		rate <<= (7 - idx);
 		e->last_packets = b.packets;
 		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps + 0xF) >> 5;
+		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
 skip:
 		read_unlock(&est_lock);
-		spin_unlock(e->stats_lock);
+		if (e->stats_lock)
+			spin_unlock(e->stats_lock);
 	}
 
 	if (!list_empty(&elist[idx].list))
@@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount
  * @opt: rate estimator configuration TLV
  *
  * Creates a new rate estimator with &bstats as source and &rate_est
@@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
+		      seqcount_t *running,
 		      struct nlattr *opt)
 {
 	struct gen_estimator *est;
@@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
 
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
+	est->running = running;
 	est->ewma_log = parm->ewma_log;
 	est->last_bytes = b.bytes;
 	est->avbps = rate_est->bps<<5;
@@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount (might be NULL)
  * @opt: rate estimator configuration TLV
  *
  * Replaces the configuration of a rate estimator by calling
@@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt)
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index f96ee8b9478d..d9c210caff32 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
 	return 0;
 
 nla_put_failure:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 
@@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 {
 	memset(d, 0, sizeof(*d));
 
-	spin_lock_bh(lock);
-	d->lock = lock;
 	if (type)
 		d->tail = (struct nlattr *)skb_tail_pointer(skb);
 	d->skb = skb;
 	d->compat_tc_stats = tc_stats_type;
 	d->compat_xstats = xstats_type;
 	d->padattr = padattr;
-
+	if (lock) {
+		d->lock = lock;
+		spin_lock_bh(lock);
+	}
 	if (d->tail)
 		return gnet_stats_copy(d, type, NULL, 0, padattr);
 
@@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 }
 
 void
-__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+__gnet_stats_copy_basic(const seqcount_t *running,
+			struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_basic_cpu __percpu *cpu,
 			struct gnet_stats_basic_packed *b)
 {
+	unsigned int seq;
+
 	if (cpu) {
 		__gnet_stats_copy_basic_cpu(bstats, cpu);
-	} else {
+		return;
+	}
+	do {
+		if (running)
+			seq = read_seqcount_begin(running);
 		bstats->bytes = b->bytes;
 		bstats->packets = b->packets;
-	}
+	} while (running && read_seqcount_retry(running, seq));
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
@@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
  * if the room in the socket buffer was not sufficient.
  */
 int
-gnet_stats_copy_basic(struct gnet_dump *d,
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
 		      struct gnet_stats_basic_cpu __percpu *cpu,
 		      struct gnet_stats_basic_packed *b)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
-	__gnet_stats_copy_basic(&bstats, cpu, b);
+	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bytes = bstats.bytes;
@@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
 	return 0;
 
 err_out:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 EXPORT_SYMBOL(gnet_stats_copy_app);
@@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
 			return -1;
 	}
 
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return 0;
 }
 EXPORT_SYMBOL(gnet_stats_finish_copy);
```
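
For orientation only, a sketch of how a dump path and an estimator caller might use the changed entry points. The in-tree call sites live under net/sched (outside this net/core-limited view), and the qdisc field names follow the parent commit, so treat this as an assumption rather than part of the patch:

```c
/* Hypothetical caller sketch, not taken from this diff. 'sch->running',
 * 'sch->cpu_bstats', 'sch->bstats' and 'sch->rate_est' follow the struct
 * Qdisc layout of this era; the real callers are in net/sched.
 */
static int example_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	/* Consistent basic counters via the running seqcount; the root
	 * qdisc spinlock is no longer taken here.
	 */
	if (gnet_stats_copy_basic(&sch->running, d,
				  sch->cpu_bstats, &sch->bstats) < 0)
		return -1;
	return 0;
}

static int example_attach_estimator(struct Qdisc *sch, struct nlattr *opt)
{
	/* The estimator samples counters under the same seqcount, so a
	 * NULL stats_lock is now acceptable for such callers.
	 */
	return gen_new_estimator(&sch->bstats, sch->cpu_bstats,
				 &sch->rate_est, NULL,
				 &sch->running, opt);
}
```

Passing a NULL stats_lock presumably only makes sense when a running seqcount is supplied; est_timer() still takes stats_lock when it is non-NULL, as the gen_estimator.c hunk above shows.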