/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

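/*
 * Map the vector register names used below to their encoding numbers
 * (.Lv<n>.4s) so that the sm4e/sm4ekey macros can emit the SM4E and
 * SM4EKEY instructions as raw .inst words.  Hand-encoding keeps the file
 * assembling even when the toolchain does not know the SM4 extension.
 */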
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21
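/*
 * RIV and RMAC intentionally alias v20: no routine below uses the IV and
 * the MAC state at the same time.  RMASK holds the {1, 0x87} constant for
 * the XTS tweak update (see tweak_next below).
 */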


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1	{v0.16b}, [x0];
	rev32	v0.16b, v0.16b;
	ld1	{v1.16b}, [x3];
	/* load ck */
	ld1	{v24.16b-v27.16b}, [x4], #64;
	ld1	{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor	v0.16b, v0.16b, v1.16b;

	sm4ekey	v0.4s, v0.4s, v24.4s;
	sm4ekey	v1.4s, v0.4s, v25.4s;
	sm4ekey	v2.4s, v1.4s, v26.4s;
	sm4ekey	v3.4s, v2.4s, v27.4s;
	sm4ekey	v4.4s, v3.4s, v28.4s;
	sm4ekey	v5.4s, v4.4s, v29.4s;
	sm4ekey	v6.4s, v5.4s, v30.4s;
	sm4ekey	v7.4s, v6.4s, v31.4s;

	adr_l	x5, .Lbswap128_mask
	ld1	{v24.16b}, [x5]

	st1	{v0.16b-v3.16b}, [x1], #64;
	st1	{v4.16b-v7.16b}, [x1];

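	/*
	 * Decryption round keys are the encryption round keys in reverse
	 * order: .Lbswap128_mask reverses the four 32-bit words inside each
	 * 128-bit vector, and the vectors themselves are stored v7..v0.
	 */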
	tbl	v16.16b, {v7.16b}, v24.16b
	tbl	v17.16b, {v6.16b}, v24.16b
	tbl	v18.16b, {v5.16b}, v24.16b
	tbl	v19.16b, {v4.16b}, v24.16b
	tbl	v20.16b, {v3.16b}, v24.16b
	tbl	v21.16b, {v2.16b}, v24.16b
	tbl	v22.16b, {v1.16b}, v24.16b
	tbl	v23.16b, {v0.16b}, v24.16b

	st1	{v16.16b-v19.16b}, [x2], #64
	st1	{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
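	/* SM4_PREPARE() (sm4-ce-asm.h) loads the 32 round keys into v24-v31 */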
	SM4_PREPARE(x0)

	ld1	{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1	{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

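	/*
	 * Bulk path: process eight blocks per iteration, then fall through
	 * to four-block and single-block tails.  "tbnz w3, #31" branches
	 * when the subtraction went negative, i.e. fewer than eight blocks
	 * remain.
	 */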
.Lcrypt_loop_blk:
	sub	w3, w3, #8;
	tbnz	w3, #31, .Lcrypt_tail8;

	ld1	{v0.16b-v3.16b}, [x2], #64;
	ld1	{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1	{v0.16b-v3.16b}, [x1], #64;
	st1	{v4.16b-v7.16b}, [x1], #64;

	cbz	w3, .Lcrypt_end;
	b	.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add	w3, w3, #8;
	cmp	w3, #4;
	blt	.Lcrypt_tail4;

	sub	w3, w3, #4;

	ld1	{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1	{v0.16b-v3.16b}, [x1], #64;

	cbz	w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub	w3, w3, #1;

	ld1	{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1	{v0.16b}, [x1], #16;

	cbnz	w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

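	/*
	 * CBC encryption is inherently serial: every block depends on the
	 * previous ciphertext.  The 4x path below only batches loads and
	 * stores; the cipher itself still runs one block at a time.
	 */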
.Lcbc_enc_loop_4x:
	cmp	w4, #4
	blt	.Lcbc_enc_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	eor	v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor	v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor	v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor	v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1	{v0.16b-v3.16b}, [x1], #64
	mov	RIV.16b, v3.16b

	cbz	w4, .Lcbc_enc_end
	b	.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	eor	RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1	{RIV.16b}, [x1], #16

	cbnz	w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

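	/*
	 * CBC decryption parallelizes: keep the original ciphertext blocks
	 * in v0-v7 for the XOR chain and run the cipher on byte-swapped
	 * copies (v8-v15) via the _BE variants, so the sources survive.
	 */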
.Lcbc_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcbc_dec_4x

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64

	rev32	v8.16b, v0.16b
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b
	rev32	v12.16b, v4.16b
	rev32	v13.16b, v5.16b
	rev32	v14.16b, v6.16b
	rev32	v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor	v8.16b, v8.16b, RIV.16b
	eor	v9.16b, v9.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b
	eor	v12.16b, v12.16b, v3.16b
	eor	v13.16b, v13.16b, v4.16b
	eor	v14.16b, v14.16b, v5.16b
	eor	v15.16b, v15.16b, v6.16b

	st1	{v8.16b-v11.16b}, [x1], #64
	st1	{v12.16b-v15.16b}, [x1], #64

	mov	RIV.16b, v7.16b

	cbz	w4, .Lcbc_dec_end
	b	.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lcbc_dec_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v8.16b, v0.16b
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor	v8.16b, v8.16b, RIV.16b
	eor	v9.16b, v9.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b

	st1	{v8.16b-v11.16b}, [x1], #64

	mov	RIV.16b, v3.16b

	cbz	w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	rev32	v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor	v8.16b, v8.16b, RIV.16b
	st1	{v8.16b}, [x1], #16

	mov	RIV.16b, v0.16b

	cbnz	w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub	w5, w4, #16
	uxtw	x5, w5
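	/* x5 = nbytes - 16: length of the trailing partial block */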

	ld1	{RIV.16b}, [x3]

	ld1	{v0.16b}, [x2]
	eor	RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping loads */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl	v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl	v1.16b, {v1.16b}, v4.16b

	eor	v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add	x5, x1, x5
	st1	{v0.16b}, [x5]
	st1	{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub	w5, w4, #16
	uxtw	x5, w5

	ld1	{RIV.16b}, [x3]

	/* load permute table */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping loads */
	ld1	{v0.16b}, [x2], x5
	ld1	{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx	v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

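	/*
	 * CFB encryption is serial: keystream block i is E(C[i-1]), so the
	 * 4x path only batches loads and stores.  "rev32 v8, <src>" copies
	 * and byte-swaps in one instruction so the _BE crypt variant can be
	 * used without clobbering the chaining value.
	 */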
.Lcfb_enc_loop_4x:
	cmp	w4, #4
	blt	.Lcfb_enc_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor	v0.16b, v0.16b, v8.16b

	rev32	v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor	v1.16b, v1.16b, v8.16b

	rev32	v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor	v2.16b, v2.16b, v8.16b

	rev32	v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor	v3.16b, v3.16b, v8.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	mov	RIV.16b, v3.16b

	cbz	w4, .Lcfb_enc_end
	b	.Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)
	eor	RIV.16b, RIV.16b, v0.16b

	st1	{RIV.16b}, [x1], #16

	cbnz	w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

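	/*
	 * CFB decryption parallelizes: the eight keystream blocks are
	 * E(IV), E(C0), ..., E(C6) and are all available up front.
	 */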
.Lcfb_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcfb_dec_4x

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64

	rev32	v8.16b, RIV.16b
	rev32	v9.16b, v0.16b
	rev32	v10.16b, v1.16b
	rev32	v11.16b, v2.16b
	rev32	v12.16b, v3.16b
	rev32	v13.16b, v4.16b
	rev32	v14.16b, v5.16b
	rev32	v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	mov	RIV.16b, v7.16b

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcfb_dec_end
	b	.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lcfb_dec_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v8.16b, RIV.16b
	rev32	v9.16b, v0.16b
	rev32	v10.16b, v1.16b
	rev32	v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	mov	RIV.16b, v3.16b

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor	RIV.16b, RIV.16b, v0.16b
	st1	{RIV.16b}, [x1], #16

	mov	RIV.16b, v0.16b

	cbnz	w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp	x7, x8, [x3]
	rev	x7, x7
	rev	x8, x8

.Lctr_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lctr_4x

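/*
 * The counter is kept in x7:x8 in native byte order (the ldp/rev above
 * undoes the big-endian layout in memory).  inc_le128() materialises the
 * current value as a big-endian counter block in a vector register and
 * then advances the 128-bit counter with an adds/adc carry chain.
 */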
#define inc_le128(vctr)			\
	mov	vctr.d[1], x8;		\
	mov	vctr.d[0], x7;		\
	adds	x8, x8, #1;		\
	rev64	vctr.16b, vctr.16b;	\
	adc	x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1	{v8.16b-v11.16b}, [x2], #64
	ld1	{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lctr_end
	b	.Lctr_loop_8x

.Lctr_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lctr_loop_1x

	sub	w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1	{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lctr_end

.Lctr_loop_1x:
	sub	w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1	{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	cbnz	w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


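/*
 * tweak_next(vt, vin, RTMP): vt = vin * x in GF(2^128), i.e. the XTS tweak
 * update.  Each 64-bit lane is shifted left by one (add vt, vin, vin); the
 * sign-extended top bit of each lane, masked with RMASK (lane 0 = 1,
 * lane 1 = 0x87) and swapped across lanes by ext, supplies the carry into
 * the high half and the x^7+x^2+x+1 reduction of the low half.
 */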
#define tweak_next(vt, vin, RTMP)				\
	sshr	RTMP.2d, vin.2d, #63;				\
	and	RTMP.16b, RTMP.16b, RMASK.16b;			\
	add	vt.2d, vin.2d, vin.2d;				\
	ext	RTMP.16b, RTMP.16b, RTMP.16b, #8;		\
	eor	vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1	{v8.16b}, [x3]

	cbz	x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

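	/*
	 * w5 = nbytes % 16 (length of the trailing partial block) and
	 * w4 = number of full blocks.  If there is a tail, the last full
	 * block is held back for ciphertext stealing.  RMASK is set up as
	 * {1, 0x87} for tweak_next().
	 */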
	ands	w5, w4, #15
	lsr	w4, w4, #4
	sub	w6, w4, #1
	csel	w4, w4, w6, eq
	uxtw	x5, w5

	movi	RMASK.2s, #0x1
	movi	RTMP0.2s, #0x87
	uzp1	RMASK.4s, RMASK.4s, RTMP0.4s

	cbz	w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lxts_enc_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz	w4, .Lxts_enc_cts
	b	.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lxts_enc_loop_1x

	sub	w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz	w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16
	eor	v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz	w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz	x5, .Lxts_enc_end

	/* cipher text stealing */

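	/*
	 * Encrypt the reserved last full block with the current tweak (v8).
	 * Its leading x5 bytes become the final ciphertext Cn; the partial
	 * plaintext Pn is padded with the tail of En-1 (tbx) and encrypted
	 * with the next tweak (v9).
	 */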
	tweak_next(v9, v8, RTMP0)
	ld1	{v0.16b}, [x2]
	eor	v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping loads */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl	v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx	v0.16b, {v1.16b}, v4.16b

	eor	v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	b	.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1	{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1	{v8.16b}, [x3]

	cbz	x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands	w5, w4, #15
	lsr	w4, w4, #4
	sub	w6, w4, #1
	csel	w4, w4, w6, eq
	uxtw	x5, w5

	movi	RMASK.2s, #0x1
	movi	RTMP0.2s, #0x87
	uzp1	RMASK.4s, RMASK.4s, RTMP0.4s

	cbz	w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lxts_dec_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz	w4, .Lxts_dec_cts
	b	.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lxts_dec_loop_1x

	sub	w4, w4, #4

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz	w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16
	eor	v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz	w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz	x5, .Lxts_dec_end

	/* cipher text stealing */

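	/*
	 * Note the tweak order is swapped relative to encryption: the last
	 * full ciphertext block is decrypted with the following tweak (v9),
	 * and the reassembled block with the current tweak (v8).
	 */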
	tweak_next(v9, v8, RTMP0)
	ld1	{v0.16b}, [x2]
	eor	v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping loads */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl	v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx	v0.16b, {v1.16b}, v4.16b

	eor	v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	b	.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1	{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1	{RMAC.16b}, [x1]

	cbz	w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz	w3, .Lmac_ret

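	/*
	 * If enc_after is zero, hold back the final block: it is only XORed
	 * into the MAC state at .Lmac_end and its encryption is deferred;
	 * the next call then passes enc_before to encrypt the pending state.
	 */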
	sub	w6, w3, #1
	cmp	w5, wzr
	csel	w3, w3, w6, ne

	cbz	w3, .Lmac_end

.Lmac_loop_4x:
	cmp	w3, #4
	blt	.Lmac_loop_1x

	sub	w3, w3, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	eor	RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz	w3, .Lmac_end
	b	.Lmac_loop_4x

.Lmac_loop_1x:
	sub	w3, w3, #1

	ld1	{v0.16b}, [x2], #16

	eor	RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz	w3, .Lmac_loop_1x


.Lmac_end:
	cbnz	w5, .Lmac_ret

	ld1	{v0.16b}, [x2], #16
	eor	RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1	{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


.section ".rodata", "a"
.align 4
.Lbswap128_mask:
	.byte	0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte	0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

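/*
 * Helper table for ciphertext stealing.  A 16-byte load at offset n
 * (0 < n < 16) gives a tbl index vector whose first 16-n entries are 0xff
 * (tbl yields zero) and whose last n entries select bytes 0..n-1; a load
 * at offset 32-n gives indices 16-n..15 followed by 0xff padding.
 */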
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff