crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance
This patch adds an improved F-macro for the 4-way parallel functions. With the
new F-macro for the 4-way parallel functions, blowfish sees a ~15% improvement
in speed tests on AMD Phenom II (~5% on Intel Xeon E7330).
However, when used in the 1-way blowfish function, the new macro would be ~10%
slower than the original, so the old F-macro is kept for the 1-way functions.
The patch cleans up the old F-macro, as it is no longer needed in the 4-way part.
The patch also renames register macros to reduce stack usage.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 44eb23a..391d245 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -56,38 +56,32 @@
#define RT0 %rbp
#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
#define RT0d %ebp
#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
-#define RK0 %r8
-#define RK1 %r9
-#define RK2 %r10
-#define RK3 %r11
-
-#define RK0d %r8d
-#define RK1d %r9d
-#define RK2d %r10d
-#define RK3d %r11d
-
-#define RKEY %r12
+#define RKEY %r10
/***********************************************************************
* 1-way blowfish
***********************************************************************/
-#define F(x, k) \
- rorq $16, x; \
- movzbl x ## bh, RT0d; \
- movzbl x ## bl, RT1d; \
- rolq $16, x; \
- movl s0(CTX,RT0,4), k ## d; \
- addl s1(CTX,RT1,4), k ## d; \
- movzbl x ## bh, RT0d; \
- movzbl x ## bl, RT1d; \
- rolq $32, x; \
- xorl s2(CTX,RT0,4), k ## d; \
- addl s3(CTX,RT1,4), k ## d; \
- xorq k, x;
+#define F() \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT1d; \
+ rolq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT1,4), RT0d; \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT2d; \
+ rolq $32, RX0; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT2,4), RT0d; \
+ xorq RT0, RX0;
#define add_roundkey_enc(n) \
xorq p+4*(n)(CTX), RX0;
@@ -95,11 +89,8 @@
#define round_enc(n) \
add_roundkey_enc(n); \
\
- F(RX0, RK0); \
- F(RX0, RK0);
-
-#define round_final_enc(n) \
- xorq p+4*(n)(CTX), RX0;
+ F(); \
+ F();
#define add_roundkey_dec(n) \
movq p+4*(n-1)(CTX), RT0; \
@@ -109,8 +100,8 @@
#define round_dec(n) \
add_roundkey_dec(n); \
\
- F(RX0, RK0); \
- F(RX0, RK0); \
+ F(); \
+ F(); \
#define read_block() \
movq (RIO), RX0; \
@@ -130,16 +121,15 @@
.type __blowfish_enc_blk,@function;
__blowfish_enc_blk:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- // %rcx: bool xor
- pushq %rbp;
- pushq %rbx;
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ movq %rbp, %r11;
- pushq %rsi;
- pushq %rcx;
+ movq %rsi, %r10;
movq %rdx, RIO;
read_block();
@@ -154,38 +144,31 @@
round_enc(14);
add_roundkey_enc(16);
- popq %rbp;
- popq RIO;
+ movq %r11, %rbp;
- test %bpl, %bpl;
+ movq %r10, RIO;
+ test %cl, %cl;
jnz __enc_xor;
write_block();
-
-__enc_ret:
- popq %rbx;
- popq %rbp;
-
ret;
-
__enc_xor:
xor_block();
-
- jmp __enc_ret;
+ ret;
.align 8
.global blowfish_dec_blk
.type blowfish_dec_blk,@function;
blowfish_dec_blk:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- pushq %rbp;
- pushq %rbx;
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %rbp, %r11;
- pushq %rsi;
+ movq %rsi, %r10;
movq %rdx, RIO;
read_block();
@@ -200,17 +183,33 @@
round_dec(3);
add_roundkey_dec(1);
- popq RIO;
+ movq %r10, RIO;
write_block();
- popq %rbx;
- popq %rbp;
+ movq %r11, %rbp;
ret;
/**********************************************************************
4-way blowfish, four blocks parallel
**********************************************************************/
+
/* F4(x): Blowfish F-function step for the 4-way code path.
 *
 * x is a 64-bit register that has bl/bh byte aliases (x ## bl, x ## bh).
 * The four bytes of the low 32 bits of x index the 4-byte-entry tables
 * s0..s3 addressed off CTX:
 *
 *	RT0d = ((s0[byte3] + s1[byte2]) ^ s2[byte1]) + s3[byte0]
 *
 * The two "rorq $16" leave x rotated right by 32 bits in total. RT0d is
 * written with 32-bit operations, so RT0 holds the zero-extended result
 * and the final xorq only changes the new low half of x.
 *
 * All four byte indexes are extracted before the first table load.
 * Clobbers RT0-RT3 (and the flags, via rorq/addl/xorl/xorq).
 *
 * Slower when used alone/1-way, but faster when used parallel/4-way
 * (tested on AMD Phenom II & Intel Xeon E7330).
 */
#define F4(x) \
movzbl x ## bh, RT1d; \
movzbl x ## bl, RT3d; \
rorq $16, x; \
movzbl x ## bh, RT0d; \
movzbl x ## bl, RT2d; \
rorq $16, x; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT2,4), RT0d; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT3,4), RT0d; \
xorq RT0, x;
+
#define add_preloaded_roundkey4() \
xorq RKEY, RX0; \
xorq RKEY, RX1; \
@@ -227,15 +226,15 @@
#define round_enc4(n) \
add_roundkey_enc4(n); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3); \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3);
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
#define preload_roundkey_dec(n) \
movq p+4*((n)-1)(CTX), RKEY; \
@@ -248,15 +247,15 @@
#define round_dec4(n) \
add_roundkey_dec4(n); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3); \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3);
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
#define read_block4() \
movq (RIO), RX0; \
@@ -306,18 +305,19 @@
.type __blowfish_enc_blk_4way,@function;
__blowfish_enc_blk_4way:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- // %rcx: bool xor
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
pushq %rbp;
pushq %rbx;
- pushq RKEY;
+ pushq %rcx;
+
preload_roundkey_enc(0);
- pushq %rsi;
- pushq %rcx;
+ movq %rsi, %r11;
movq %rdx, RIO;
read_block4();
@@ -333,40 +333,39 @@
add_preloaded_roundkey4();
popq %rbp;
- popq RIO;
+ movq %r11, RIO;
test %bpl, %bpl;
jnz __enc_xor4;
write_block4();
-__enc_ret4:
- popq RKEY;
popq %rbx;
popq %rbp;
-
ret;
__enc_xor4:
xor_block4();
- jmp __enc_ret4;
+ popq %rbx;
+ popq %rbp;
+ ret;
.align 8
.global blowfish_dec_blk_4way
.type blowfish_dec_blk_4way,@function;
blowfish_dec_blk_4way:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
pushq %rbp;
pushq %rbx;
- pushq RKEY;
preload_roundkey_dec(17);
- pushq %rsi;
+ movq %rsi, %r11;
movq %rdx, RIO;
read_block4();
@@ -381,10 +380,9 @@
round_dec4(3);
add_preloaded_roundkey4();
- popq RIO;
+ movq %r11, RIO;
write_block4();
- popq RKEY;
popq %rbx;
popq %rbp;