net: remove dev_txq_stats_fold()

After recent changes (percpu stats on vlan/tunnels...), we no longer
need the per struct netdev_queue tx_bytes/tx_packets/tx_dropped counters.

The only remaining users are ixgbe, sch_teql, gianfar & macvlan:

1) ixgbe can be converted to use existing tx_ring counters.

2) macvlan incremented txq->tx_dropped; it can use the
dev->stats.tx_dropped counter instead.

3) sch_teql: almost a revert of ab35cd4b8f42 (Use net_device internal stats).
    Now that we have ndo_get_stats64(), use it, even for "unsigned long"
fields (no need to bring back a struct net_device_stats).

4) gianfar adds a stats structure per tx queue to hold
tx_bytes/tx_packets

This removes a lockdep warning (and possible lockup) in the rndis gadget,
which calls dev_get_stats() from hard IRQ context.

Ref: http://www.spinics.net/lists/netdev/msg149202.html

Reported-by: Neil Jones <neiljay@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
CC: Sandeep Gopalpet <sandeep.kumar@freescale.com>
CC: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 45c4b7b..f1d4b45 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -433,7 +433,6 @@
 static struct net_device_stats *gfar_get_stats(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	struct netdev_queue *txq;
 	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
 	unsigned long tx_packets = 0, tx_bytes = 0;
 	int i = 0;
@@ -449,9 +448,8 @@
 	dev->stats.rx_dropped = rx_dropped;
 
 	for (i = 0; i < priv->num_tx_queues; i++) {
-		txq = netdev_get_tx_queue(dev, i);
-		tx_bytes += txq->tx_bytes;
-		tx_packets += txq->tx_packets;
+		tx_bytes += priv->tx_queue[i]->stats.tx_bytes;
+		tx_packets += priv->tx_queue[i]->stats.tx_packets;
 	}
 
 	dev->stats.tx_bytes = tx_bytes;
@@ -2108,8 +2106,8 @@
 	}
 
 	/* Update transmit stats */
-	txq->tx_bytes += skb->len;
-	txq->tx_packets ++;
+	tx_queue->stats.tx_bytes += skb->len;
+	tx_queue->stats.tx_packets++;
 
 	txbdp = txbdp_start = tx_queue->cur_tx;
 	lstatus = txbdp->lstatus;
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 68984eb..54de413 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -907,12 +907,21 @@
 	MQ_MG_MODE
 };
 
+/*
+ * Per TX queue packet/byte counters
+ */
+struct tx_q_stats {
+	unsigned long tx_packets;	/* packets accounted on transmit */
+	unsigned long tx_bytes;		/* total skb->len accounted on transmit */
+};
+
 /**
  *	struct gfar_priv_tx_q - per tx queue structure
  *	@txlock: per queue tx spin lock
  *	@tx_skbuff:skb pointers
  *	@skb_curtx: to be used skb pointer
  *	@skb_dirtytx:the last used skb pointer
+ *	@stats: bytes/packets stats
  *	@qindex: index of this queue
  *	@dev: back pointer to the dev structure
  *	@grp: back pointer to the group to which this queue belongs
@@ -934,6 +943,7 @@
 	struct	txbd8 *tx_bd_base;
 	struct	txbd8 *cur_tx;
 	struct	txbd8 *dirty_tx;
+	struct tx_q_stats stats;
 	struct	net_device *dev;
 	struct gfar_priv_grp *grp;
 	u16	skb_curtx;
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index a060610..602078b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -6667,8 +6667,6 @@
 			  struct ixgbe_adapter *adapter,
 			  struct ixgbe_ring *tx_ring)
 {
-	struct net_device *netdev = tx_ring->netdev;
-	struct netdev_queue *txq;
 	unsigned int first;
 	unsigned int tx_flags = 0;
 	u8 hdr_len = 0;
@@ -6765,9 +6763,6 @@
 		/* add the ATR filter if ATR is on */
 		if (test_bit(__IXGBE_TX_FDIR_INIT_DONE, &tx_ring->state))
 			ixgbe_atr(tx_ring, skb, tx_flags, protocol);
-		txq = netdev_get_tx_queue(netdev, tx_ring->queue_index);
-		txq->tx_bytes += skb->len;
-		txq->tx_packets++;
 		ixgbe_tx_queue(tx_ring, tx_flags, count, skb->len, hdr_len);
 		ixgbe_maybe_stop_tx(tx_ring, DESC_NEEDED);
 
@@ -6925,8 +6920,6 @@
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	int i;
 
-	/* accurate rx/tx bytes/packets stats */
-	dev_txq_stats_fold(netdev, stats);
 	rcu_read_lock();
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]);
@@ -6943,6 +6936,22 @@
 			stats->rx_bytes   += bytes;
 		}
 	}
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->tx_ring[i]);
+		u64 bytes, packets;
+		unsigned int start;
+
+		if (ring) {
+			do {
+				start = u64_stats_fetch_begin_bh(&ring->syncp);
+				packets = ring->stats.packets;
+				bytes   = ring->stats.bytes;
+			} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+			stats->tx_packets += packets;
+			stats->tx_bytes   += bytes;
+		}
+	}
 	rcu_read_unlock();
 	/* following stats updated by ixgbe_watchdog_task() */
 	stats->multicast	= netdev->stats.multicast;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 21845af..5933621 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -585,7 +585,7 @@
 	rcu_read_lock_bh();
 	vlan = rcu_dereference(q->vlan);
 	if (vlan)
-		netdev_get_tx_queue(vlan->dev, 0)->tx_dropped++;
+		vlan->dev->stats.tx_dropped++;
 	rcu_read_unlock_bh();
 
 	return err;