/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
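	// The \b1-\b4 arguments are ignored here; they exist so that
	// __pmull_\pn can be expanded with a single argument list
	// whether \pn is p64 or p8 (which uses them as scratch operands).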

/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
 */
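/*
 * In outline (a sketch of the math, not the exact instruction
 * schedule): with An/Bn denoting A/B rotated left by n bytes, the
 * 64x64 product is assembled from vmull.p8 partial products as
 *
 *	D  = A*B
 *	t0 = A1*B + A*B1	(folded in at bit offset 8)
 *	t1 = A2*B + A*B2	(offset 16)
 *	t2 = A3*B + A*B3	(offset 24)
 *	t3 = A*B4		(offset 32)
 *
 * with each tn masked (k16/k32/k48) so that the bytes that wrapped
 * around in the rotations cancel out before the final XOR into D.
 */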
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
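	// The reduction is modulo the GHASH field polynomial
	// x^128 + x^7 + x^2 + x + 1; MASK is loaded with the constant
	// 0xe1 << 57 derived from it (see the entry points below).
	//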
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
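	// This computes the same reduction, but with the multiplication
	// by the polynomial's low-order terms open-coded as left shifts
	// (#57, #62, #63) and right shifts plus XORs.
	//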
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

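	//
	// Core GHASH helper. Parameters:
	//
	//	\pn		pmull flavor to use: p64 or p8
	//	\enc		if non-blank, the prefix of a cipher hook
	//			(\enc\()_1x / \enc\()_4x) invoked on each
	//			batch of blocks before it is hashed
	//	\aggregate	if set, process 4 blocks per iteration
	//			using the precomputed powers of H
	//	\head		if set, hash an optional head block passed
	//			on the stack before the main data
	//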
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 */
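/*
 * Per the AAPCS calling convention: r0 = blocks, r1 = dg, r2 = src,
 * r3 = k; the fifth argument (head) is passed on the stack, which is
 * why ghash_update fetches it with 'ldr ip, [sp]'.
 */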
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

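	// Perform one full AES round on each register in \regs:
	// aese.8 does AddRoundKey/SubBytes/ShiftRows with round key \rk,
	// and aesmc.8 applies MixColumns.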
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

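	// Encrypt \regs in parallel using the key schedule at \rkp,
	// double-buffering the round keys through ek0/ek1 so that the
	// loads overlap the crypto instructions. \rounds picks the
	// variant: < 12 branches straight to the common tail (AES-128),
	// == 12 runs two extra rounds first (AES-192), and > 12 runs
	// four (AES-256). The final round uses aese without aesmc,
	// followed by the last AddRoundKey (veor).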
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

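	// CTR keystream helpers: the counter block is the 12-byte IV at
	// r5 followed by the 32-bit counter in r7, byte-swapped (rev)
	// into the last word; r7 is incremented once per emitted block.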
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

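	// Produce the final keystream blocks: e0 encrypts the current
	// counter block (for any partial tail block of data), and e1
	// encrypts the counter block with counter value 1, which GCM
	// uses to mask the authentication tag.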
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

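	// Cipher hooks for ghash_update: the enc_* flavors encrypt the
	// input registers in place so that the ciphertext is what gets
	// hashed, while the dec_* flavors hash the ciphertext input
	// as-is and store the decrypted result to the dst pointer (r4).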
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

/*
 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
 *			    struct gcm_key const *k, char *head,
 *			    char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

/*
 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
 *			   struct gcm_key const *k, char *head,
 *			   char *iv, int rounds, u32 counter,
 *			   const char *otag, int authsize)
 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
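	//
	// Permute table for partial blocks: a 16-byte load taken at an
	// offset into this table yields vtbl index vectors that move the
	// valid bytes of a partial block into position and zero all
	// other lanes, since out-of-range (0xff) vtbl indices produce 0.
	//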
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff