Blame - arch/x86/crypto/sha1_avx2_x86_64_asm.S - linux

blob: 9f712a7dfd797cc499e1ef52ba7999078530494a [file] [log] [blame]

chandramouli narayanan	7c1da8d	2014-03-20 15:14:00 -0700	[diff] [blame]	1	/*
				2	* Implement fast SHA-1 with AVX2 instructions. (x86_64)
				3	*
				4	* This file is provided under a dual BSD/GPLv2 license. When using or
				5	* redistributing this file, you may do so under either license.
				6	*
				7	* GPL LICENSE SUMMARY
				8	*
				9	* Copyright(c) 2014 Intel Corporation.
				10	*
				11	* This program is free software; you can redistribute it and/or modify
				12	* it under the terms of version 2 of the GNU General Public License as
				13	* published by the Free Software Foundation.
				14	*
				15	* This program is distributed in the hope that it will be useful, but
				16	* WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* Contact Information:
				21	* Ilya Albrekht <ilya.albrekht@intel.com>
				22	* Maxim Locktyukhin <maxim.locktyukhin@intel.com>
				23	* Ronen Zohar <ronen.zohar@intel.com>
				24	* Chandramouli Narayanan <mouli@linux.intel.com>
				25	*
				26	* BSD LICENSE
				27	*
				28	* Copyright(c) 2014 Intel Corporation.
				29	*
				30	* Redistribution and use in source and binary forms, with or without
				31	* modification, are permitted provided that the following conditions
				32	* are met:
				33	*
				34	* Redistributions of source code must retain the above copyright
				35	* notice, this list of conditions and the following disclaimer.
				36	* Redistributions in binary form must reproduce the above copyright
				37	* notice, this list of conditions and the following disclaimer in
				38	* the documentation and/or other materials provided with the
				39	* distribution.
				40	* Neither the name of Intel Corporation nor the names of its
				41	* contributors may be used to endorse or promote products derived
				42	* from this software without specific prior written permission.
				43	*
				44	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				45	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				46	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				47	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				48	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				49	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				50	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				51	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				52	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				53	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				54	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				55	*
				56	*/
				57
				/*
				 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
				 *
				 * This implementation is based on the previous SSSE3 release:
				 * Visit http://software.intel.com/en-us/articles/
				 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
				 *
				 * Updates 20-byte SHA-1 record in 'hash' for even number of
				 * 'num_blocks' consecutive 64-byte blocks
				 *
				 * extern "C" void sha1_transform_avx2(
				 *	int *hash, const char* input, size_t num_blocks );
				 */
				71
				72	#include <linux/linkage.h>
				73
				/*
				 * Incoming arguments.
				 *
				 * NOTE: %rdi/%rsi/%rdx are reused below as REG_RC/REG_RB/REG_RE, so
				 * CTX/BUF/CNT must be copied elsewhere before the round state is loaded.
				 */
				#define CTX	%rdi	/* arg1 */
				#define BUF	%rsi	/* arg2 */
				#define CNT	%rdx	/* arg3 */

				/* 32-bit views of the working variables and temporaries */
				#define REG_A	%ecx
				#define REG_B	%esi
				#define REG_C	%edi
				#define REG_D	%eax
				#define REG_E	%edx
				#define REG_TB	%ebx
				#define REG_TA	%r12d

				/* 64-bit views of the same registers (used as lea operands) */
				#define REG_RA	%rcx
				#define REG_RB	%rsi
				#define REG_RC	%rdi
				#define REG_RD	%rax
				#define REG_RE	%rdx
				#define REG_RTA	%r12
				#define REG_RTB	%rbx

				/* scratch register for the round functions */
				#define REG_T1	%r11d

				/* instruction aliases */
				#define xmm_mov		vmovups
				#define avx2_zeroupper	vzeroupper

				/* selector values consumed by RND_FUN */
				#define RND_F1	1
				#define RND_F2	2
				#define RND_F3	3
				98
				/*
				 * REGALLOC
				 *
				 * Bind the symbolic state names (A..E, TA, TB, their 64-bit R* forms and
				 * T1) to their initial physical registers. ROTATE_STATE re-binds these
				 * names after every round; invoking REGALLOC again restores the initial
				 * binding.
				 */
				.macro REGALLOC
					.set A, REG_A
					.set B, REG_B
					.set C, REG_C
					.set D, REG_D
					.set E, REG_E
					.set TB, REG_TB
					.set TA, REG_TA

					.set RA, REG_RA
					.set RB, REG_RB
					.set RC, REG_RC
					.set RD, REG_RD
					.set RE, REG_RE

					.set RTA, REG_RTA
					.set RTB, REG_RTB

					.set T1, REG_T1
				.endm
				119
/* Long-lived pointers and counter, kept outside the rotating round state */
#define HASH_PTR	%r9	/* hash state (five 32-bit words) */
#define BLOCKS_CTR	%r8	/* blocks still to be processed */
#define BUFFER_PTR	%r10	/* input for the low 128-bit lane */
#define BUFFER_PTR2	%r13	/* input for the high 128-bit lane */

/* Two halves of the on-stack W+K area; swapped after every block pair */
#define PRECALC_BUF	%r14	/* written by the PRECALC_* macros */
#define WK_BUF		%r15	/* read by the round macros via WK() */

/* Vector temporaries (W_TMP is the low half of WY_TMP) */
#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

/* AVX2 variables: the eight message-schedule registers */
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

/* vpshufb control, loaded from BSWAP_SHUFB_CTL */
#define YMM_SHUFB_BSWAP	%ymm10
				143
				/*
				 * Keep 2 iterations precalculated at a time:
				 *    - 80 DWORDs per iteration * 2
				 *
				 * W_SIZE is counted in DWORDs: 2 buffers * 2 blocks * 80 rounds, plus 16
				 * DWORDs of padding.
				 */
				#define W_SIZE		(80*2*2 +16)

				/*
				 * WK(t): memory operand holding W[t]+K for round t, relative to WK_BUF.
				 * Each group of four rounds occupies one 32-byte row; rounds 0..79 (first
				 * block) use the low 16 bytes of the row, rounds 80..159 (second block)
				 * the high 16 bytes.
				 *
				 * PRECALC_WK(t): memory operand for the 32-byte row written by the
				 * precalc step with index t, relative to PRECALC_BUF (4 bytes per index
				 * step, so an index that is a multiple of 8 selects a whole row).
				 */
				#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
				#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
				152
				153
				/*
				 * UPDATE_HASH hash, val
				 *
				 *   hash: memory operand holding one word of the running hash
				 *   val:  register holding the matching working variable
				 *
				 * Computes val += hash and stores the sum back to hash, so both the
				 * register and memory hold the updated word afterwards. Clobbers flags.
				 */
				.macro UPDATE_HASH hash, val
					add	\hash, \val
					mov	\val, \hash
				.endm
				158
				/*
				 * PRECALC_RESET_WY
				 *
				 * Bind the rotating schedule names WY_00..WY_28 to their initial ymm
				 * registers; WY_32 starts out aliasing WY_00. Emits no instructions.
				 */
				.macro PRECALC_RESET_WY
					.set WY_00, WY0
					.set WY_04, WY4
					.set WY_08, WY08
					.set WY_12, WY12
					.set WY_16, WY16
					.set WY_20, WY20
					.set WY_24, WY24
					.set WY_28, WY28
					.set WY_32, WY_00
				.endm
				170
				/*
				 * PRECALC_ROTATE_WY
				 *
				 * Advance the schedule window by one vector: every WY_nn name takes over
				 * the register of its predecessor and the oldest register is recycled as
				 * the new WY_00. The WY / WY_minus_nn aliases used by the PRECALC_*
				 * macros are then refreshed. Emits no instructions.
				 */
				.macro PRECALC_ROTATE_WY
					/* Rotate macros */
					.set WY_32, WY_28
					.set WY_28, WY_24
					.set WY_24, WY_20
					.set WY_20, WY_16
					.set WY_16, WY_12
					.set WY_12, WY_08
					.set WY_08, WY_04
					.set WY_04, WY_00
					.set WY_00, WY_32

					/* Define register aliases */
					.set WY, WY_00
					.set WY_minus_04, WY_04
					.set WY_minus_08, WY_08
					.set WY_minus_12, WY_12
					.set WY_minus_16, WY_16
					.set WY_minus_20, WY_20
					.set WY_minus_24, WY_24
					.set WY_minus_28, WY_28
					.set WY_minus_32, WY	/* the register being overwritten */
				.endm
				194
				/*
				 * PRECALC_00_15
				 *
				 * One slice of the message-schedule pre-compute for rounds 0-15. Reads
				 * the assembly-time symbol `i` (precalc step index) and emits at most
				 * one vector instruction per step, so the vector work is spread between
				 * the scalar round instructions. Eight consecutive steps load 16 bytes
				 * from BUFFER_PTR (low lane) and BUFFER_PTR2 (high lane), shuffle them
				 * with YMM_SHUFB_BSWAP, add the round constant and store the 32-byte
				 * row through PRECALC_WK.
				 */
				.macro PRECALC_00_15
					.if (i == 0)	/* Initialize and rotate registers */
						PRECALC_RESET_WY
						PRECALC_ROTATE_WY
					.endif

					/* message scheduling pre-compute for rounds 0-15 */
					.if ((i & 7) == 0)
						/*
						 * blended AVX2 and ALU instruction scheduling
						 * 1 vector iteration per 8 rounds
						 */
						vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
					.elseif ((i & 7) == 1)
						/* same offset as the load above, taken from the second pointer */
						vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2), WY_TMP, WY_TMP
					.elseif ((i & 7) == 2)
						vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
					.elseif ((i & 7) == 4)
						vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
					.elseif ((i & 7) == 7)
						vmovdqu	WY_TMP, PRECALC_WK(i&~7)

						PRECALC_ROTATE_WY
					.endif
				.endm
				221
				/*
				 * PRECALC_16_31
				 *
				 * message scheduling pre-compute for rounds 16-31
				 * calculating last 32 w[i] values in 8 XMM registers
				 * pre-calculate K+w[i] values and store to mem
				 * for later load by ALU add instruction
				 *
				 * "brute force" vectorization for rounds 16-31 only
				 * due to w[i]->w[i-3] dependency
				 *
				 * Like the other PRECALC_* macros this reads the assembly-time symbol
				 * `i` and emits only the slice of work belonging to step (i & 7).
				 */
				.macro PRECALC_16_31
					.if ((i & 7) == 0)
						/*
						 * blended AVX2 and ALU instruction scheduling
						 * 1 vector iteration per 8 rounds
						 */
						/* w[i-14] */
						vpalignr	$8, WY_minus_16, WY_minus_12, WY
						vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
					.elseif ((i & 7) == 1)
						vpxor	WY_minus_08, WY, WY
						vpxor	WY_minus_16, WY_TMP, WY_TMP
					.elseif ((i & 7) == 2)
						vpxor	WY_TMP, WY, WY
						vpslldq	$12, WY, WY_TMP2
					.elseif ((i & 7) == 3)
						/* rotate WY left by 1: shifted halves, combined in step 4 */
						vpslld	$1, WY, WY_TMP
						vpsrld	$31, WY, WY
					.elseif ((i & 7) == 4)
						vpor	WY, WY_TMP, WY_TMP
						/* rotate WY_TMP2 left by 2: halves combined in steps 5 and 7 */
						vpslld	$2, WY_TMP2, WY
					.elseif ((i & 7) == 5)
						vpsrld	$30, WY_TMP2, WY_TMP2
						vpxor	WY, WY_TMP, WY_TMP
					.elseif ((i & 7) == 7)
						vpxor	WY_TMP2, WY_TMP, WY
						vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
						vmovdqu	WY_TMP, PRECALC_WK(i&~7)

						PRECALC_ROTATE_WY
					.endif
				.endm
				263
				/*
				 * PRECALC_32_79
				 *
				 * in SHA-1 specification:
				 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
				 * instead we do equal:
				 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
				 * allows more efficient vectorization
				 * since w[i]=>w[i-3] dependency is broken
				 *
				 * Reads the assembly-time symbol `i` and emits only the slice of work
				 * belonging to step (i & 7).
				 */
				.macro PRECALC_32_79
					.if ((i & 7) == 0)
						/*
						 * blended AVX2 and ALU instruction scheduling
						 * 1 vector iteration per 8 rounds
						 */
						vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
					.elseif ((i & 7) == 1)
						/* W is W_minus_32 before xor */
						vpxor	WY_minus_28, WY, WY
					.elseif ((i & 7) == 2)
						vpxor	WY_minus_16, WY_TMP, WY_TMP
					.elseif ((i & 7) == 3)
						vpxor	WY_TMP, WY, WY
					.elseif ((i & 7) == 4)
						/* rotate left by 2: shift-left half here, rest in step 5 */
						vpslld	$2, WY, WY_TMP
					.elseif ((i & 7) == 5)
						vpsrld	$30, WY, WY
						vpor	WY, WY_TMP, WY
					.elseif ((i & 7) == 7)
						vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
						vmovdqu	WY_TMP, PRECALC_WK(i&~7)

						PRECALC_ROTATE_WY
					.endif
				.endm
				299
				/*
				 * PRECALC r, s
				 *
				 *   r: precalc step index; copied into the assembly-time symbol `i`
				 *      that the PRECALC_* macros read
				 *   s: forwarded unchanged to the selected macro (callers in this file
				 *      leave it empty)
				 *
				 * Selects the 32-byte row of K_XMM_AR for this step (the row changes
				 * every 40 steps) and dispatches to the schedule macro for the step:
				 * 0..31 -> PRECALC_00_15, 32..63 -> PRECALC_16_31, 64..159 ->
				 * PRECALC_32_79. Steps >= 160 emit nothing.
				 */
				.macro PRECALC r, s
					.set i, \r

					.if (i < 40)
						.set K_XMM, 32*0
					.elseif (i < 80)
						.set K_XMM, 32*1
					.elseif (i < 120)
						.set K_XMM, 32*2
					.else
						.set K_XMM, 32*3
					.endif

					.if (i<32)
						PRECALC_00_15	\s
					.elseif (i<64)
						PRECALC_16_31	\s
					.elseif (i < 160)
						PRECALC_32_79	\s
					.endif
				.endm
				321
				/*
				 * ROTATE_STATE
				 *
				 * Rename the working variables for the next round instead of moving
				 * data: E<-D, D<-C, C<-B, B<-TB, TB<-A, A<-(old E). The 64-bit R* names
				 * are rotated the same way. T_REG is a scratch symbol. Emits no
				 * instructions.
				 */
				.macro ROTATE_STATE
					.set T_REG, E
					.set E, D
					.set D, C
					.set C, B
					.set B, TB
					.set TB, A
					.set A, T_REG

					.set T_REG, RE
					.set RE, RD
					.set RD, RC
					.set RC, RB
					.set RB, RTB
					.set RTB, RA
					.set RA, T_REG
				.endm
				339
				/*
				 * RND_FUN f, r
				 *
				 *   f: RND_F1, RND_F2 or RND_F3
				 *   r: round number, passed through to the selected macro
				 *
				 * Expands to ROUND_F1, ROUND_F2 or ROUND_F3 according to f; any other
				 * value of f expands to nothing. Macro relies on saved ROUND_Fx (the
				 * caller passes the current ROUND_FUNC symbol as f).
				 */
				.macro RND_FUN f, r
					.if (\f == RND_F1)
						ROUND_F1	\r
					.elseif (\f == RND_F2)
						ROUND_F2	\r
					.elseif (\f == RND_F3)
						ROUND_F3	\r
					.endif
				.endm
				351
				/*
				 * RR r
				 *
				 * Emit two consecutive rounds, r and r+1, each followed by
				 * ROTATE_STATE. round_id is the round number modulo 80.
				 *
				 * Every round macro produces the F value consumed by the round after
				 * it, so round 0 of a block first computes its own F1 into TB (and
				 * rotates B) before the round itself runs.
				 *
				 * ROUND_FUNC is switched after the first round of the pair whose
				 * round_id is 18, 38 or 58, so the second round of that pair already
				 * expands the next round macro.
				 */
				.macro RR r
					.set round_id, (\r % 80)

					.if (round_id == 0)	/* Precalculate F for first round */
						.set ROUND_FUNC, RND_F1
						mov	B, TB

						rorx	$(32-30), B, B	/* b rol 30 */
						andn	D, TB, T1	/* ~b & d */
						and	C, TB		/* b & c */
						xor	T1, TB		/* F1 = (b&c) ^ (~b&d) */
					.endif

					RND_FUN ROUND_FUNC, \r
					ROTATE_STATE

					.if (round_id == 18)
						.set ROUND_FUNC, RND_F2
					.elseif (round_id == 38)
						.set ROUND_FUNC, RND_F3
					.elseif (round_id == 58)
						.set ROUND_FUNC, RND_F2
					.endif

					.set round_id, ( (\r+1) % 80)

					RND_FUN ROUND_FUNC, (\r+1)
					ROTATE_STATE
				.endm
				381
				/*
				 * ROUND_F1 r
				 *
				 * One round using F1. On entry TB holds F for this round (computed by
				 * the previous round, or by RR for round 0). On exit E has accumulated
				 * WK(r) + F + (A rol 5), TB holds A rol 30 and A holds F1 for the next
				 * round; ROTATE_STATE then gives those registers their new names.
				 * Clobbers T1, TA and flags. One PRECALC slice is interleaved.
				 */
				.macro ROUND_F1 r
					add	WK(\r), E

					andn	C, A, T1		/* ~b&d (next round's naming) */
					lea	(RE,RTB), E		/* Add F from the previous round */

					rorx	$(32-5), A, TA		/* TA = A rol 5 */
					rorx	$(32-30),A, TB		/* b rol 30 for next round */

					PRECALC	(\r)			/* msg scheduling for next 2 blocks */

					/*
					 * Calculate F for the next round
					 * (b & c) ^ andn[b, d]
					 */
					and	B, A			/* b&c */
					xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

					lea	(RE,RTA), E		/* E += A rol 5 */
				.endm
				402
				/*
				 * ROUND_F2 r
				 *
				 * One round using F2. Same register contract as ROUND_F1, with the
				 * next round's F computed as A ^ B ^ C. The look-ahead work (rotation
				 * into TB and the two xors) is skipped when round_id is 79, the last
				 * round of a block. Clobbers TA and flags.
				 */
				.macro ROUND_F2 r
					add	WK(\r), E
					lea	(RE,RTB), E		/* Add F from the previous round */

					/* Calculate F for the next round */
					rorx	$(32-5), A, TA		/* TA = A rol 5 */
					.if ((round_id) < 79)
						rorx	$(32-30), A, TB	/* b rol 30 for next round */
					.endif
					PRECALC	(\r)			/* msg scheduling for next 2 blocks */

					.if ((round_id) < 79)
						xor	B, A
					.endif

					add	TA, E			/* E += A rol 5 */

					.if ((round_id) < 79)
						xor	C, A
					.endif
				.endm
				424
				/*
				 * ROUND_F3 r
				 *
				 * One round using F3. Same register contract as ROUND_F1, with the
				 * next round's F computed as (A & B) | ((A | B) & C). Clobbers T1, TA
				 * and flags.
				 */
				.macro ROUND_F3 r
					add	WK(\r), E
					PRECALC	(\r)			/* msg scheduling for next 2 blocks */

					lea	(RE,RTB), E		/* Add F from the previous round */

					mov	B, T1
					or	A, T1			/* T1 = A | B */

					rorx	$(32-5), A, TA		/* TA = A rol 5 */
					rorx	$(32-30), A, TB		/* b rol 30 for next round */

					/*
					 * Calculate F for the next round
					 * (b and c) or (d and (b or c))
					 */
					and	C, T1
					and	B, A
					or	T1, A

					add	TA, E			/* E += A rol 5 */

				.endm
				447
/*
 * ADD_IF_GE a, b, c, d
 *
 *   a: 64-bit register to advance
 *   b: register compared against c
 *   c: immediate threshold
 *   d: immediate increment
 *
 * Branch-free conditional add: a = (b >= c) ? a + d : a.
 * The comparison is signed (cmovge). Uses RTA as a temporary and clobbers
 * flags.
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA		/* RTA = a + d, the candidate value */
	cmp	$\c, \b
	cmovge	RTA, \a			/* commit only when b >= c */
.endm
				457
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 *
 * Expects HASH_PTR, BUFFER_PTR, BUFFER_PTR2, BLOCKS_CTR and YMM_SHUFB_BSWAP
 * to be initialised and the W+K work area to start at %rsp.
 *
 * Each loop iteration handles up to two blocks. While the scalar rounds
 * consume W+K values from WK_BUF, the interleaved PRECALC slices fill
 * PRECALC_BUF for the following pair; the two buffers are swapped at the
 * end of the iteration. BUFFER_PTR feeds the low 128-bit lane (first block
 * of a pair) and BUFFER_PTR2 the high lane (second block); both are only
 * advanced while enough blocks remain (see ADD_IF_GE).
 *
 * All labels are assembler-local (.L prefix).
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	/* Split the work area: one 2*4*80-byte W+K table plus 32 bytes each */
	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	/* Precalc WK for first 2 blocks */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept 160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * code loops through more than one block;
	 * BLOCKS_CTR reaching zero signals that no block is left
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp	.L_loop0
.L_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* No second block in this pair: back to the top, which exits */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop1
.L_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop2
.L_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	.L_loop3
.L_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/*
	 * Reset state for AVX2 reg permutation: move the values into the
	 * registers that the names get back from REGALLOC below
	 */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm
				/*
				 * macro implements SHA-1 function's body for several 64-byte blocks
				 * param: function's name
				 *
				 * Emitted function, in order:
				 *   - saves %rbx and %r12-%r15, all of which the body uses
				 *   - rounds %rsp down to a 32-byte boundary, pushes the pre-alignment
				 *     %rsp (copied to %rbx) and reserves RESERVE_STACK bytes below it
				 *   - copies CTX/BUF/CNT into HASH_PTR, BUFFER_PTR/BUFFER_PTR2 and
				 *     BLOCKS_CTR, because the argument registers are reused as round
				 *     state
				 *   - loads the byte-swap shuffle control and runs
				 *     SHA1_PIPELINED_MAIN_BODY
				 *   - releases the work area, restores the saved %rsp with `pop %rsp`
				 *     and pops the saved registers
				 *
				 * vzeroupper is issued before and after the vector code.
				 */
				.macro SHA1_VECTOR_ASM  name
					ENTRY(\name)

					push	%rbx
					push	%r12
					push	%r13
					push	%r14
					push	%r15

					RESERVE_STACK	= (W_SIZE*4 + 8+24)

					/* Align stack */
					mov	%rsp, %rbx
					and	$~(0x20-1), %rsp
					push	%rbx			/* original %rsp, popped below */
					sub	$RESERVE_STACK, %rsp

					avx2_zeroupper

					/* Setup initial values */
					mov	CTX, HASH_PTR
					mov	BUF, BUFFER_PTR

					mov	BUF, BUFFER_PTR2
					mov	CNT, BLOCKS_CTR

					xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

					SHA1_PIPELINED_MAIN_BODY

					avx2_zeroupper

					add	$RESERVE_STACK, %rsp
					pop	%rsp			/* undo the alignment */

					pop	%r15
					pop	%r14
					pop	%r13
					pop	%r12
					pop	%rbx

					ret

					ENDPROC(\name)
				.endm
				681
				.section .rodata

				/* Round constants, one per 32-byte row of K_XMM_AR */
				#define K1 0x5a827999
				#define K2 0x6ed9eba1
				#define K3 0x8f1bbcdc
				#define K4 0xca62c1d6

				/*
				 * K_XMM_AR: each constant replicated across eight dwords (one ymm row).
				 * PRECALC selects the row through K_XMM = 32 * row.
				 */
				.align 128
				K_XMM_AR:
					.long K1, K1, K1, K1
					.long K1, K1, K1, K1
					.long K2, K2, K2, K2
					.long K2, K2, K2, K2
					.long K3, K3, K3, K3
					.long K3, K3, K3, K3
					.long K4, K4, K4, K4
					.long K4, K4, K4, K4

				/*
				 * vpshufb control loaded into YMM_SHUFB_BSWAP: the same 16-byte pattern
				 * in both lanes, reversing the byte order inside every dword.
				 */
				BSWAP_SHUFB_CTL:
					.long 0x00010203
					.long 0x04050607
					.long 0x08090a0b
					.long 0x0c0d0e0f
					.long 0x00010203
					.long 0x04050607
					.long 0x08090a0b
					.long 0x0c0d0e0f
				.text
				710
				/* Instantiate the transform under its exported name */
				SHA1_VECTOR_ASM     sha1_transform_avx2