chelsio: more rx speedup

Clean up receive processing some more:
   * do the reserve padding of skb during setup
   * don't pass constants to get_packet
   * do smart prefetch of skb
   * make copybreak a module parameter

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
index 8e287e7..a156119 100644
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -71,12 +71,9 @@
 #define SGE_FREEL_REFILL_THRESH	16
 #define SGE_RESPQ_E_N		1024
 #define SGE_INTRTIMER_NRES	1000
-#define SGE_RX_COPY_THRES	256
 #define SGE_RX_SM_BUF_SIZE	1536
 #define SGE_TX_DESC_MAX_PLEN	16384
 
-# define SGE_RX_DROP_THRES 2
-
 #define SGE_RESPQ_REPLENISH_THRES (SGE_RESPQ_E_N / 4)
 
 /*
@@ -846,6 +843,8 @@
 		skb_reserve(skb, q->dma_offset);
 		mapping = pci_map_single(pdev, skb->data, dma_len,
 					 PCI_DMA_FROMDEVICE);
+		skb_reserve(skb, sge->rx_pkt_pad);
+
 		ce->skb = skb;
 		pci_unmap_addr_set(ce, dma_addr, mapping);
 		pci_unmap_len_set(ce, dma_len, dma_len);
@@ -1024,6 +1023,10 @@
 	}
 }
 
+static int copybreak __read_mostly = 256;
+module_param(copybreak, int, 0);
+MODULE_PARM_DESC(copybreak, "Receive copy threshold");
+
 /**
  *	get_packet - return the next ingress packet buffer
  *	@pdev: the PCI device that received the packet
@@ -1043,45 +1046,42 @@
  *	be copied but there is no memory for the copy.
  */
 static inline struct sk_buff *get_packet(struct pci_dev *pdev,
-					 struct freelQ *fl, unsigned int len,
-					 int dma_pad, int skb_pad,
-					 unsigned int copy_thres,
-					 unsigned int drop_thres)
+					 struct freelQ *fl, unsigned int len)
 {
 	struct sk_buff *skb;
-	struct freelQ_ce *ce = &fl->centries[fl->cidx];
+	const struct freelQ_ce *ce = &fl->centries[fl->cidx];
 
-	if (len < copy_thres) {
-		skb = alloc_skb(len + skb_pad, GFP_ATOMIC);
-		if (likely(skb != NULL)) {
-			skb_reserve(skb, skb_pad);
-			skb_put(skb, len);
-			pci_dma_sync_single_for_cpu(pdev,
-					    pci_unmap_addr(ce, dma_addr),
-					    pci_unmap_len(ce, dma_len),
-					    PCI_DMA_FROMDEVICE);
-			memcpy(skb->data, ce->skb->data + dma_pad, len);
-			pci_dma_sync_single_for_device(pdev,
-					    pci_unmap_addr(ce, dma_addr),
-					    pci_unmap_len(ce, dma_len),
-					    PCI_DMA_FROMDEVICE);
-		} else if (!drop_thres)
+	if (len < copybreak) {
+		skb = alloc_skb(len + 2, GFP_ATOMIC);
+		if (!skb)
 			goto use_orig_buf;
 
+		skb_reserve(skb, 2);	/* align IP header */
+		skb_put(skb, len);
+		pci_dma_sync_single_for_cpu(pdev,
+					    pci_unmap_addr(ce, dma_addr),
+					    pci_unmap_len(ce, dma_len),
+					    PCI_DMA_FROMDEVICE);
+		memcpy(skb->data, ce->skb->data, len);
+		pci_dma_sync_single_for_device(pdev,
+					       pci_unmap_addr(ce, dma_addr),
+					       pci_unmap_len(ce, dma_len),
+					       PCI_DMA_FROMDEVICE);
 		recycle_fl_buf(fl, fl->cidx);
 		return skb;
 	}
 
-	if (fl->credits < drop_thres) {
+use_orig_buf:
+	if (fl->credits < 2) {
 		recycle_fl_buf(fl, fl->cidx);
 		return NULL;
 	}
 
-use_orig_buf:
 	pci_unmap_single(pdev, pci_unmap_addr(ce, dma_addr),
 			 pci_unmap_len(ce, dma_len), PCI_DMA_FROMDEVICE);
 	skb = ce->skb;
-	skb_reserve(skb, dma_pad);
+	prefetch(skb->data);
+
 	skb_put(skb, len);
 	return skb;
 }
@@ -1359,27 +1359,25 @@
  *
  *	Process an ingress ethernet pakcet and deliver it to the stack.
  */
-static int sge_rx(struct sge *sge, struct freelQ *fl, unsigned int len)
+static void sge_rx(struct sge *sge, struct freelQ *fl, unsigned int len)
 {
 	struct sk_buff *skb;
-	struct cpl_rx_pkt *p;
+	const struct cpl_rx_pkt *p;
 	struct adapter *adapter = sge->adapter;
 	struct sge_port_stats *st;
 
-	skb = get_packet(adapter->pdev, fl, len - sge->rx_pkt_pad,
-			 sge->rx_pkt_pad, 2, SGE_RX_COPY_THRES,
-			 SGE_RX_DROP_THRES);
+	skb = get_packet(adapter->pdev, fl, len - sge->rx_pkt_pad);
 	if (unlikely(!skb)) {
 		sge->stats.rx_drops++;
-		return 0;
+		return;
 	}
 
-	p = (struct cpl_rx_pkt *)skb->data;
-	skb_pull(skb, sizeof(*p));
+	p = (const struct cpl_rx_pkt *) skb->data;
 	if (p->iff >= adapter->params.nports) {
 		kfree_skb(skb);
-		return 0;
+		return;
 	}
+	__skb_pull(skb, sizeof(*p));
 
 	skb->dev = adapter->port[p->iff].dev;
 	skb->dev->last_rx = jiffies;
@@ -1411,7 +1409,6 @@
 		netif_rx(skb);
 #endif
 	}
-	return 0;
 }
 
 /*
@@ -1493,12 +1490,11 @@
 	struct sge *sge = adapter->sge;
 	struct respQ *q = &sge->respQ;
 	struct respQ_e *e = &q->entries[q->cidx];
-	int budget_left = budget;
+	int done = 0;
 	unsigned int flags = 0;
 	unsigned int cmdq_processed[SGE_CMDQ_N] = {0, 0};
 
-
-	while (likely(budget_left && e->GenerationBit == q->genbit)) {
+	while (done < budget && e->GenerationBit == q->genbit) {
 		flags |= e->Qsleeping;
 
 		cmdq_processed[0] += e->Cmdq0CreditReturn;
@@ -1508,14 +1504,16 @@
 		 * ping-pong of TX state information on MP where the sender
 		 * might run on a different CPU than this function...
 		 */
-		if (unlikely(flags & F_CMDQ0_ENABLE || cmdq_processed[0] > 64)) {
+		if (unlikely((flags & F_CMDQ0_ENABLE) || cmdq_processed[0] > 64)) {
 			flags = update_tx_info(adapter, flags, cmdq_processed[0]);
 			cmdq_processed[0] = 0;
 		}
+
 		if (unlikely(cmdq_processed[1] > 16)) {
 			sge->cmdQ[1].processed += cmdq_processed[1];
 			cmdq_processed[1] = 0;
 		}
+
 		if (likely(e->DataValid)) {
 			struct freelQ *fl = &sge->freelQ[e->FreelistQid];
 
@@ -1525,12 +1523,16 @@
 			else
 				sge_rx(sge, fl, e->BufferLength);
 
+			++done;
+
 			/*
 			 * Note: this depends on each packet consuming a
 			 * single free-list buffer; cf. the BUG above.
 			 */
 			if (++fl->cidx == fl->size)
 				fl->cidx = 0;
+			prefetch(fl->centries[fl->cidx].skb);
+
 			if (unlikely(--fl->credits <
 				     fl->size - SGE_FREEL_REFILL_THRESH))
 				refill_free_list(sge, fl);
@@ -1549,14 +1551,12 @@
 			writel(q->credits, adapter->regs + A_SG_RSPQUEUECREDIT);
 			q->credits = 0;
 		}
-		--budget_left;
 	}
 
 	flags = update_tx_info(adapter, flags, cmdq_processed[0]);
 	sge->cmdQ[1].processed += cmdq_processed[1];
 
-	budget -= budget_left;
-	return budget;
+	return done;
 }
 
 static inline int responses_pending(const struct adapter *adapter)
@@ -1581,11 +1581,14 @@
 	struct sge *sge = adapter->sge;
 	struct respQ *q = &sge->respQ;
 	struct respQ_e *e = &q->entries[q->cidx];
+	const struct freelQ *fl = &sge->freelQ[e->FreelistQid];
 	unsigned int flags = 0;
 	unsigned int cmdq_processed[SGE_CMDQ_N] = {0, 0};
 
+	prefetch(fl->centries[fl->cidx].skb);
 	if (e->DataValid)
 		return 1;
+
 	do {
 		flags |= e->Qsleeping;