sfc: Use write-combining to reduce TX latency

Based on work by Neil Turton <nturton@solarflare.com> and
Kieran Mansley <kmansley@solarflare.com>.

The BIU has now been verified to handle 3- and 4-dword writes within a
single 128-bit register correctly.  This means we can enable write-
combining and only insert write barriers between writes to distinct
registers.

This has been observed to save about 0.5 us when pushing a TX
descriptor to an empty TX queue.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
diff --git a/drivers/net/sfc/io.h b/drivers/net/sfc/io.h
index dc45110..d9d8c2e 100644
--- a/drivers/net/sfc/io.h
+++ b/drivers/net/sfc/io.h
@@ -48,9 +48,9 @@
  *   replacing the low 96 bits with zero does not affect functionality.
  * - If the host writes to the last dword address of such a register
  *   (i.e. the high 32 bits) the underlying register will always be
- *   written.  If the collector does not hold values for the low 96
- *   bits of the register, they will be written as zero.  Writing to
- *   the last qword does not have this effect and must not be done.
+ *   written.  If the collector and the current write together do not
+ *   provide values for all 128 bits of the register, the low 96 bits
+ *   will be written as zero.
  * - If the host writes to the address of any other part of such a
  *   register while the collector already holds values for some other
  *   register, the write is discarded and the collector maintains its
@@ -103,6 +103,7 @@
 	_efx_writed(efx, value->u32[2], reg + 8);
 	_efx_writed(efx, value->u32[3], reg + 12);
 #endif
+	wmb();
 	mmiowb();
 	spin_unlock_irqrestore(&efx->biu_lock, flags);
 }
@@ -125,6 +126,7 @@
 	__raw_writel((__force u32)value->u32[0], membase + addr);
 	__raw_writel((__force u32)value->u32[1], membase + addr + 4);
 #endif
+	wmb();
 	mmiowb();
 	spin_unlock_irqrestore(&efx->biu_lock, flags);
 }
@@ -139,6 +141,7 @@
 
 	/* No lock required */
 	_efx_writed(efx, value->u32[0], reg);
+	wmb();
 }
 
 /* Read a 128-bit CSR, locking as appropriate. */
@@ -237,12 +240,14 @@
 
 #ifdef EFX_USE_QWORD_IO
 	_efx_writeq(efx, value->u64[0], reg + 0);
+	_efx_writeq(efx, value->u64[1], reg + 8);
 #else
 	_efx_writed(efx, value->u32[0], reg + 0);
 	_efx_writed(efx, value->u32[1], reg + 4);
-#endif
 	_efx_writed(efx, value->u32[2], reg + 8);
 	_efx_writed(efx, value->u32[3], reg + 12);
+#endif
+	wmb();
 }
 #define efx_writeo_page(efx, value, reg, page)				\
 	_efx_writeo_page(efx, value,					\