| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * sm3-neon-core.S - SM3 secure hash using NEON instructions |
| * |
| * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64 |
| * |
| * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
| * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> |
| */ |
| |
| #include <linux/linkage.h> |
| #include <linux/cfi_types.h> |
| #include <asm/assembler.h> |
| |
| /* Context structure */ |
| |
| #define state_h0 0 |
| #define state_h1 4 |
| #define state_h2 8 |
| #define state_h3 12 |
| #define state_h4 16 |
| #define state_h5 20 |
| #define state_h6 24 |
| #define state_h7 28 |
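
/*
 * These offsets assume the usual struct sm3_state layout, which begins
 * with u32 state[8] holding the eight chaining words A..H at
 * consecutive 4-byte offsets.
 */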
| |
| /* Stack structure */ |
| |
| #define STACK_W_SIZE (32 * 2 * 3) |
| |
| #define STACK_W (0) |
| #define STACK_SIZE (STACK_W + STACK_W_SIZE) |
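
/*
 * Scratch layout: three 64-byte slots.  Within a slot, offsets 0 and
 * 16 hold expanded message words W and W ^ W' (see XW_W_ADDR below),
 * while offsets 32 and 48 hold the byte-swapped input words and their
 * W ^ W' pairs used by rounds 0-11 (see IW_W_ADDR).
 */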
| |
| /* Register macros */ |
| |
| #define RSTATE x0 |
| #define RDATA x1 |
| #define RNBLKS x2 |
| #define RKPTR x28 |
| #define RFRAME x29 |
| |
| #define ra w3 |
| #define rb w4 |
| #define rc w5 |
| #define rd w6 |
| #define re w7 |
| #define rf w8 |
| #define rg w9 |
| #define rh w10 |
| |
| #define t0 w11 |
| #define t1 w12 |
| #define t2 w13 |
| #define t3 w14 |
| #define t4 w15 |
| #define t5 w16 |
| #define t6 w17 |
| |
| #define k_even w19 |
| #define k_odd w20 |
| |
| #define addr0 x21 |
| #define addr1 x22 |
| |
| #define s0 w23 |
| #define s1 w24 |
| #define s2 w25 |
| #define s3 w26 |
| |
| #define W0 v0 |
| #define W1 v1 |
| #define W2 v2 |
| #define W3 v3 |
| #define W4 v4 |
| #define W5 v5 |
| |
| #define XTMP0 v6 |
| #define XTMP1 v7 |
| #define XTMP2 v16 |
| #define XTMP3 v17 |
| #define XTMP4 v18 |
| #define XTMP5 v19 |
| #define XTMP6 v20 |
| |
| /* Helper macros. */ |
| |
| #define _(...) /*_*/ |
| |
| #define clear_vec(x) \ |
| movi x.8h, #0; |
| |
| #define rolw(o, a, n) \ |
| ror o, a, #(32 - n); |
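
/*
 * AArch64 has no rotate-left instruction, so rol(a, n) is expressed
 * as ror(a, 32 - n).
 */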
| |
| /* Round function macros. */ |
| |
| #define GG1_1(x, y, z, o, t) \ |
| eor o, x, y; |
| #define GG1_2(x, y, z, o, t) \ |
| eor o, o, z; |
| #define GG1_3(x, y, z, o, t) |
| |
| #define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t) |
| #define FF1_2(x, y, z, o, t) |
| #define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t) |
| |
| #define GG2_1(x, y, z, o, t) \ |
| bic o, z, x; |
| #define GG2_2(x, y, z, o, t) \ |
| and t, y, x; |
| #define GG2_3(x, y, z, o, t) \ |
| eor o, o, t; |
| |
| #define FF2_1(x, y, z, o, t) \ |
| eor o, x, y; |
| #define FF2_2(x, y, z, o, t) \ |
| and t, x, y; \ |
| and o, o, z; |
| #define FF2_3(x, y, z, o, t) \ |
| eor o, o, t; |
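
/*
 * For rounds 16-63, FF2 (majority) and GG2 (choose) are computed in
 * forms whose OR terms are disjoint, so the final OR becomes an EOR:
 *
 *   FF2(x, y, z) = (x & y) | (x & z) | (y & z) = ((x ^ y) & z) ^ (x & y)
 *   GG2(x, y, z) = (x & y) | (~x & z)          = (y & x) ^ (z & ~x)
 */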
| |
| #define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ |
| K_LOAD(round); \ |
| ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \ |
| rolw(t0, a, 12); /* rol(a, 12) => t0 */ \ |
| IOP(1, iop_param); \ |
| FF##i##_1(a, b, c, t1, t2); \ |
| ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \ |
| add k, k, e; \ |
| IOP(2, iop_param); \ |
| GG##i##_1(e, f, g, t3, t4); \ |
| FF##i##_2(a, b, c, t1, t2); \ |
| IOP(3, iop_param); \ |
| add k, k, t0; \ |
| add h, h, t5; \ |
| add d, d, t6; /* w1w2 + d => d */ \ |
| IOP(4, iop_param); \ |
rolw(k, k, 7); /* rol(t0 + e + t, 7) => k */ \
| GG##i##_2(e, f, g, t3, t4); \ |
| add h, h, k; /* h + w1 + k => h */ \ |
| IOP(5, iop_param); \ |
| FF##i##_3(a, b, c, t1, t2); \ |
| eor t0, t0, k; /* k ^ t0 => t0 */ \ |
| GG##i##_3(e, f, g, t3, t4); \ |
| add d, d, t1; /* FF(a,b,c) + d => d */ \ |
| IOP(6, iop_param); \ |
| add t3, t3, h; /* GG(e,f,g) + h => t3 */ \ |
| rolw(b, b, 9); /* rol(b, 9) => b */ \ |
| eor h, t3, t3, ror #(32-9); \ |
| IOP(7, iop_param); \ |
| add d, d, t0; /* t0 + d => d */ \ |
| rolw(f, f, 19); /* rol(f, 19) => f */ \ |
| IOP(8, iop_param); \ |
| eor h, h, t3, ror #(32-17); /* P0(t3) => h */ |
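
/*
 * R() computes one SM3 compression round, following the standard
 * description (the A..H rotation is handled by argument order):
 *
 *   SS1 = rol(rol(a, 12) + e + K[round], 7)
 *   SS2 = SS1 ^ rol(a, 12)
 *   d   = FF(a, b, c) + d + SS2 + (W[round] ^ W[round+4])
 *   h   = P0(GG(e, f, g) + h + SS1 + W[round])
 *   b   = rol(b, 9), f = rol(f, 19)
 *
 * where P0(x) = x ^ rol(x, 9) ^ rol(x, 17).  The IOP(n, ...) hooks
 * let one vector instruction of the message schedule (or block load)
 * be interleaved after each step for in-order cores.
 */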
| |
| #define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ |
| R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) |
| |
| #define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \ |
| R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param) |
| |
| #define KL(round) \ |
| ldp k_even, k_odd, [RKPTR, #(4*(round))]; |
| |
| /* Input expansion macros. */ |
| |
| /* Byte-swapped input address. */ |
| #define IW_W_ADDR(round, widx, offs) \ |
| (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4)) |
| |
| /* Expanded input address. */ |
| #define XW_W_ADDR(round, widx, offs) \ |
(STACK_W + ((((round) / 3) - 4) % 3) * 64 + (offs) + ((widx) * 4))
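
/*
 * The expanded words cycle through the three stack slots: the slot
 * written while precalculating rounds r..r+2 (during transform rounds
 * r-6..r-4) is not rewritten before precalc r+9, by which time rounds
 * r..r+2 have consumed it.
 */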
| |
/* Rounds 0-11, byte-swapped input block addresses. */
| #define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 32) |
| #define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48) |
| |
/* Rounds 12-63, expanded message word addresses. */
| #define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0) |
| #define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16) |
| |
| /* Input block loading. |
| * Interleaving within round function needed for in-order CPUs. */ |
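/*
 * One 64-byte input block is loaded into W0-W3, each 32-bit word is
 * byte-swapped (SM3 input words are big-endian), and the swapped words
 * plus their W ^ W' pairs are stored for rounds 0-11.  W0-W5 are then
 * rearranged to hold three message words each, ready for the schedule
 * below.
 */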
| #define LOAD_W_VEC_1_1() \ |
| add addr0, sp, #IW_W1_ADDR(0, 0); |
| #define LOAD_W_VEC_1_2() \ |
| add addr1, sp, #IW_W1_ADDR(4, 0); |
| #define LOAD_W_VEC_1_3() \ |
| ld1 {W0.16b}, [RDATA], #16; |
| #define LOAD_W_VEC_1_4() \ |
| ld1 {W1.16b}, [RDATA], #16; |
| #define LOAD_W_VEC_1_5() \ |
| ld1 {W2.16b}, [RDATA], #16; |
| #define LOAD_W_VEC_1_6() \ |
| ld1 {W3.16b}, [RDATA], #16; |
| #define LOAD_W_VEC_1_7() \ |
| rev32 XTMP0.16b, W0.16b; |
| #define LOAD_W_VEC_1_8() \ |
| rev32 XTMP1.16b, W1.16b; |
| #define LOAD_W_VEC_2_1() \ |
| rev32 XTMP2.16b, W2.16b; |
| #define LOAD_W_VEC_2_2() \ |
| rev32 XTMP3.16b, W3.16b; |
| #define LOAD_W_VEC_2_3() \ |
| eor XTMP4.16b, XTMP1.16b, XTMP0.16b; |
| #define LOAD_W_VEC_2_4() \ |
| eor XTMP5.16b, XTMP2.16b, XTMP1.16b; |
| #define LOAD_W_VEC_2_5() \ |
| st1 {XTMP0.16b}, [addr0], #16; |
| #define LOAD_W_VEC_2_6() \ |
| st1 {XTMP4.16b}, [addr0]; \ |
| add addr0, sp, #IW_W1_ADDR(8, 0); |
| #define LOAD_W_VEC_2_7() \ |
| eor XTMP6.16b, XTMP3.16b, XTMP2.16b; |
| #define LOAD_W_VEC_2_8() \ |
| ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */ |
| #define LOAD_W_VEC_3_1() \ |
| mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */ |
| #define LOAD_W_VEC_3_2() \ |
| st1 {XTMP1.16b}, [addr1], #16; |
| #define LOAD_W_VEC_3_3() \ |
| st1 {XTMP5.16b}, [addr1]; \ |
| ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */ |
| #define LOAD_W_VEC_3_4() \ |
| ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */ |
| #define LOAD_W_VEC_3_5() \ |
| ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */ |
| #define LOAD_W_VEC_3_6() \ |
| st1 {XTMP2.16b}, [addr0], #16; |
| #define LOAD_W_VEC_3_7() \ |
| st1 {XTMP6.16b}, [addr0]; |
| #define LOAD_W_VEC_3_8() \ |
| ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */ |
| |
| #define LOAD_W_VEC_1(iop_num, ...) \ |
| LOAD_W_VEC_1_##iop_num() |
| #define LOAD_W_VEC_2(iop_num, ...) \ |
| LOAD_W_VEC_2_##iop_num() |
| #define LOAD_W_VEC_3(iop_num, ...) \ |
| LOAD_W_VEC_3_##iop_num() |
| |
| /* Message scheduling. Note: 3 words per vector register. |
| * Interleaving within round function needed for in-order CPUs. */ |
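/*
 * Each invocation computes three new words of the SM3 expansion:
 *
 *   W[i] = P1(W[i-16] ^ W[i-9] ^ rol(W[i-3], 15))
 *          ^ rol(W[i-13], 7) ^ W[i-6]
 *
 * with P1(x) = x ^ rol(x, 15) ^ rol(x, 23).  SCHED_W_*(round, ...)
 * produces W[round+4..round+6] and stores the W and W ^ W' values
 * read by transform rounds round..round+2.
 */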
| #define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \ |
| /* Load (w[i - 16]) => XTMP0 */ \ |
| /* Load (w[i - 13]) => XTMP5 */ \ |
| ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */ |
| #define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \ |
| ext XTMP5.16b, w1.16b, w1.16b, #12; |
| #define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \ |
| ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */ |
| #define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \ |
| ext XTMP5.16b, XTMP5.16b, w2.16b, #12; |
| #define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \ |
| /* w[i - 9] == w3 */ \ |
| /* W3 ^ XTMP0 => XTMP0 */ \ |
| eor XTMP0.16b, XTMP0.16b, w3.16b; |
| #define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \ |
| /* w[i - 3] == w5 */ \ |
/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
| /* rol(XTMP5, 7) => XTMP1 */ \ |
| add addr0, sp, #XW_W1_ADDR((round), 0); \ |
| shl XTMP2.4s, w5.4s, #15; |
| #define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \ |
| shl XTMP1.4s, XTMP5.4s, #7; |
| #define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \ |
| sri XTMP2.4s, w5.4s, #(32-15); |
| #define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \ |
| sri XTMP1.4s, XTMP5.4s, #(32-7); |
| #define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \ |
| eor XTMP0.16b, XTMP0.16b, XTMP2.16b; |
| #define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \ |
| /* w[i - 6] == W4 */ \ |
| /* W4 ^ XTMP1 => XTMP1 */ \ |
| eor XTMP1.16b, XTMP1.16b, w4.16b; |
| #define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \ |
| /* P1(XTMP0) ^ XTMP1 => W0 */ \ |
| shl XTMP3.4s, XTMP0.4s, #15; |
| #define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \ |
| shl XTMP4.4s, XTMP0.4s, #23; |
| #define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \ |
| eor w0.16b, XTMP1.16b, XTMP0.16b; |
| #define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \ |
| sri XTMP3.4s, XTMP0.4s, #(32-15); |
| #define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \ |
| sri XTMP4.4s, XTMP0.4s, #(32-23); |
| #define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \ |
| eor w0.16b, w0.16b, XTMP3.16b; |
| #define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \ |
| /* Load (w[i - 3]) => XTMP2 */ \ |
| ext XTMP2.16b, w4.16b, w4.16b, #12; |
| #define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \ |
| eor w0.16b, w0.16b, XTMP4.16b; |
| #define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \ |
| ext XTMP2.16b, XTMP2.16b, w5.16b, #12; |
| #define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \ |
| /* W1 ^ W2 => XTMP3 */ \ |
| eor XTMP3.16b, XTMP2.16b, w0.16b; |
| #define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5) |
| #define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \ |
| st1 {XTMP2.16b-XTMP3.16b}, [addr0]; |
| #define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5) |
| |
| #define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5) |
| #define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5) |
| #define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5) |
| |
| #define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0) |
| #define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0) |
| #define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0) |
| |
| #define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1) |
| #define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1) |
| #define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1) |
| |
| #define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2) |
| #define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2) |
| #define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2) |
| |
| #define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3) |
| #define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3) |
| #define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3) |
| |
| #define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \ |
| SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4) |
| #define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \ |
| SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4) |
| #define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \ |
| SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4) |
| |
| |
| /* |
| * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'. |
| * |
| * void sm3_neon_transform(struct sm3_state *sst, u8 const *src, |
| * int blocks) |
| */ |
| .text |
| .align 3 |
| SYM_TYPED_FUNC_START(sm3_neon_transform) |
| ldp ra, rb, [RSTATE, #0] |
| ldp rc, rd, [RSTATE, #8] |
| ldp re, rf, [RSTATE, #16] |
| ldp rg, rh, [RSTATE, #24] |
| |
| stp x28, x29, [sp, #-16]! |
| stp x19, x20, [sp, #-16]! |
| stp x21, x22, [sp, #-16]! |
| stp x23, x24, [sp, #-16]! |
| stp x25, x26, [sp, #-16]! |
| mov RFRAME, sp |
| |
| sub addr0, sp, #STACK_SIZE |
| adr_l RKPTR, .LKtable |
| and sp, addr0, #(~63) |
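/* Carve out STACK_SIZE bytes of scratch below the saved registers and
 * align sp down to 64 bytes so each 64-byte W slot is naturally
 * aligned; the pre-alignment sp is kept in RFRAME (x29) for the
 * epilogue. */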
| |
| /* Preload first block. */ |
| LOAD_W_VEC_1(1, 0) |
| LOAD_W_VEC_1(2, 0) |
| LOAD_W_VEC_1(3, 0) |
| LOAD_W_VEC_1(4, 0) |
| LOAD_W_VEC_1(5, 0) |
| LOAD_W_VEC_1(6, 0) |
| LOAD_W_VEC_1(7, 0) |
| LOAD_W_VEC_1(8, 0) |
| LOAD_W_VEC_2(1, 0) |
| LOAD_W_VEC_2(2, 0) |
| LOAD_W_VEC_2(3, 0) |
| LOAD_W_VEC_2(4, 0) |
| LOAD_W_VEC_2(5, 0) |
| LOAD_W_VEC_2(6, 0) |
| LOAD_W_VEC_2(7, 0) |
| LOAD_W_VEC_2(8, 0) |
| LOAD_W_VEC_3(1, 0) |
| LOAD_W_VEC_3(2, 0) |
| LOAD_W_VEC_3(3, 0) |
| LOAD_W_VEC_3(4, 0) |
| LOAD_W_VEC_3(5, 0) |
| LOAD_W_VEC_3(6, 0) |
| LOAD_W_VEC_3(7, 0) |
| LOAD_W_VEC_3(8, 0) |
| |
| .balign 16 |
| .Loop: |
| /* Transform 0-3 */ |
| R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0) |
| R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0) |
| R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0) |
| R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0) |
| |
| /* Transform 4-7 + Precalc 12-14 */ |
| R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0) |
| R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0) |
| R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12) |
| R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12) |
| |
| /* Transform 8-11 + Precalc 12-17 */ |
| R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12) |
| R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15) |
| R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15) |
| R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15) |
| |
| /* Transform 12-14 + Precalc 18-20 */ |
| R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18) |
| R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18) |
| R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18) |
| |
| /* Transform 15-17 + Precalc 21-23 */ |
| R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21) |
| |
| /* Transform 18-20 + Precalc 24-26 */ |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24) |
| |
| /* Transform 21-23 + Precalc 27-29 */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27) |
| |
| /* Transform 24-26 + Precalc 30-32 */ |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30) |
| |
| /* Transform 27-29 + Precalc 33-35 */ |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33) |
| |
| /* Transform 30-32 + Precalc 36-38 */ |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36) |
| |
| /* Transform 33-35 + Precalc 39-41 */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39) |
| |
| /* Transform 36-38 + Precalc 42-44 */ |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42) |
| |
| /* Transform 39-41 + Precalc 45-47 */ |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45) |
| |
| /* Transform 42-44 + Precalc 48-50 */ |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48) |
| |
| /* Transform 45-47 + Precalc 51-53 */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51) |
| |
| /* Transform 48-50 + Precalc 54-56 */ |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54) |
| |
| /* Transform 51-53 + Precalc 57-59 */ |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57) |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57) |
| |
| /* Transform 54-56 + Precalc 60-62 */ |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60) |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60) |
| |
| /* Transform 57-59 + Precalc 63 */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63) |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63) |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63) |
| |
| /* Transform 60 */ |
| R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _) |
| subs RNBLKS, RNBLKS, #1 |
| b.eq .Lend |
| |
| /* Transform 61-63 + Preload next block */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _) |
| ldp s0, s1, [RSTATE, #0] |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _) |
| ldp s2, s3, [RSTATE, #8] |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _) |
| |
| /* Update the chaining variables. */ |
| eor ra, ra, s0 |
| eor rb, rb, s1 |
| ldp s0, s1, [RSTATE, #16] |
| eor rc, rc, s2 |
| ldp k_even, k_odd, [RSTATE, #24] |
| eor rd, rd, s3 |
| eor re, re, s0 |
| stp ra, rb, [RSTATE, #0] |
| eor rf, rf, s1 |
| stp rc, rd, [RSTATE, #8] |
| eor rg, rg, k_even |
| stp re, rf, [RSTATE, #16] |
| eor rh, rh, k_odd |
| stp rg, rh, [RSTATE, #24] |
| b .Loop |
| |
| .Lend: |
| /* Transform 61-63 */ |
| R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _) |
| ldp s0, s1, [RSTATE, #0] |
| R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _) |
| ldp s2, s3, [RSTATE, #8] |
| R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _) |
| |
| /* Update the chaining variables. */ |
| eor ra, ra, s0 |
| clear_vec(W0) |
| eor rb, rb, s1 |
| clear_vec(W1) |
| ldp s0, s1, [RSTATE, #16] |
| clear_vec(W2) |
| eor rc, rc, s2 |
| clear_vec(W3) |
| ldp k_even, k_odd, [RSTATE, #24] |
| clear_vec(W4) |
| eor rd, rd, s3 |
| clear_vec(W5) |
| eor re, re, s0 |
| clear_vec(XTMP0) |
| stp ra, rb, [RSTATE, #0] |
| clear_vec(XTMP1) |
| eor rf, rf, s1 |
| clear_vec(XTMP2) |
| stp rc, rd, [RSTATE, #8] |
| clear_vec(XTMP3) |
| eor rg, rg, k_even |
| clear_vec(XTMP4) |
| stp re, rf, [RSTATE, #16] |
| clear_vec(XTMP5) |
| eor rh, rh, k_odd |
| clear_vec(XTMP6) |
| stp rg, rh, [RSTATE, #24] |
| |
/* Clear the message expansion area on the stack so no message-derived
 * data is left behind (the vector registers were cleared above). */
| add addr0, sp, #STACK_W |
| st1 {W0.16b-W3.16b}, [addr0], #64 |
| st1 {W0.16b-W3.16b}, [addr0], #64 |
| st1 {W0.16b-W3.16b}, [addr0] |
| |
| mov sp, RFRAME |
| |
| ldp x25, x26, [sp], #16 |
| ldp x23, x24, [sp], #16 |
| ldp x21, x22, [sp], #16 |
| ldp x19, x20, [sp], #16 |
| ldp x28, x29, [sp], #16 |
| |
| ret |
| SYM_FUNC_END(sm3_neon_transform) |
| |
| |
| .section ".rodata", "a" |
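
/*
 * Round constants, pre-rotated: K[j] = rol(T_j, j mod 32), with
 * T_j = 0x79cc4519 for rounds 0-15 and 0x7a879d8a for rounds 16-63.
 */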
| |
| .align 4 |
| .LKtable: |
| .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb |
| .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc |
| .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce |
| .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6 |
| .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c |
| .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce |
| .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec |
| .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 |
| .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53 |
| .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d |
| .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4 |
| .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43 |
| .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c |
| .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce |
| .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec |
| .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 |