net: sched: make bstats per cpu and estimator RCU safe

In order to run qdiscs without locking, statistics and estimators
need to be handled correctly.

To resolve bstats, make the statistics per cpu. And because this is
only needed for qdiscs that run without locks, which will not be the
case for most qdiscs in the near future, only create the percpu
stats when a qdisc sets the TCQ_F_CPUSTATS flag.
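
For reference, the per cpu backing store pairs the packed counters
with a u64_stats_sync so 64-bit reads stay safe on 32-bit systems.
A minimal sketch of the qdisc side, assuming the struct layout from
this series; the exact allocation site is illustrative only:

	/* Per cpu bstats: counters plus a seqcount for 64-bit reads. */
	struct gnet_stats_basic_cpu {
		struct gnet_stats_basic_packed bstats;
		struct u64_stats_sync syncp;
	};

	/* Only lockless qdiscs pay the per cpu cost; the placement of
	 * this check is an assumption for illustration.
	 */
	if (sch->flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout;
	}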

Next, because estimators use the bstats to calculate packets per
second and bytes per second, the estimator code paths are updated
to use the per cpu statistics.
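
The diff below relies on __gnet_stats_copy_basic() to snapshot either
representation into one gnet_stats_basic_packed. That helper lives in
net/core/gen_stats.c and is not part of this hunk; the following is a
sketch of the logic it needs, assuming the per cpu reads are guarded
by u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq():

	/* Sketch of the aggregation helper: fold per cpu counters into
	 * a single snapshot, or fall back to the shared counters when
	 * the qdisc is still lock-protected (cpu == NULL).
	 */
	static void
	copy_basic_sketch(struct gnet_stats_basic_packed *bstats,
			  struct gnet_stats_basic_cpu __percpu *cpu,
			  struct gnet_stats_basic_packed *b)
	{
		int i;

		if (!cpu) {
			bstats->bytes = b->bytes;
			bstats->packets = b->packets;
			return;
		}

		for_each_possible_cpu(i) {
			struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
			unsigned int start;
			u64 bytes;
			u32 packets;

			/* Reread any cpu whose writer raced with us. */
			do {
				start = u64_stats_fetch_begin_irq(&bcpu->syncp);
				bytes = bcpu->bstats.bytes;
				packets = bcpu->bstats.packets;
			} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));

			bstats->bytes += bytes;
			bstats->packets += packets;
		}
	}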

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9d33dff..9dfb88a 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -91,6 +91,8 @@
 	u32			avpps;
 	struct rcu_head		e_rcu;
 	struct rb_node		node;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct rcu_head		head;
 };
 
 struct gen_estimator_head
@@ -115,9 +117,8 @@
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		u64 nbytes;
+		struct gnet_stats_basic_packed b = {0};
 		u64 brate;
-		u32 npackets;
 		u32 rate;
 
 		spin_lock(e->stats_lock);
@@ -125,15 +126,15 @@
 		if (e->bstats == NULL)
 			goto skip;
 
-		nbytes = e->bstats->bytes;
-		npackets = e->bstats->packets;
-		brate = (nbytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = nbytes;
+		__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+
+		brate = (b.bytes - e->last_bytes)<<(7 - idx);
+		e->last_bytes = b.bytes;
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
 		e->rate_est->bps = (e->avbps+0xF)>>5;
 
-		rate = (npackets - e->last_packets)<<(12 - idx);
-		e->last_packets = npackets;
+		rate = (b.packets - e->last_packets)<<(12 - idx);
+		e->last_packets = b.packets;
 		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
 		e->rate_est->pps = (e->avpps+0x1FF)>>10;
 skip:
@@ -203,12 +204,14 @@
  *
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
 		      struct nlattr *opt)
 {
 	struct gen_estimator *est;
 	struct gnet_estimator *parm = nla_data(opt);
+	struct gnet_stats_basic_packed b = {0};
 	int idx;
 
 	if (nla_len(opt) < sizeof(*parm))
@@ -221,15 +224,18 @@
 	if (est == NULL)
 		return -ENOBUFS;
 
+	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = bstats->bytes;
+	est->last_bytes = b.bytes;
 	est->avbps = rate_est->bps<<5;
-	est->last_packets = bstats->packets;
+	est->last_packets = b.packets;
 	est->avpps = rate_est->pps<<10;
+	est->cpu_bstats = cpu_bstats;
 
 	spin_lock_bh(&est_tree_lock);
 	if (!elist[idx].timer.function) {
@@ -290,11 +296,12 @@
  * Returns 0 on success or a negative error code.
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
 			  spinlock_t *stats_lock, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
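
With the extra argument, callers that keep the qdisc lock simply pass
NULL for cpu_bstats and see no behaviour change. A hypothetical call
site in the qdisc setup path, with names assumed for illustration:

	/* cpu_bstats is NULL unless the qdisc set TCQ_F_CPUSTATS. */
	err = gen_new_estimator(&sch->bstats, sch->cpu_bstats,
				&sch->rate_est,
				qdisc_root_sleeping_lock(sch),
				tca[TCA_RATE]);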