| // SPDX-License-Identifier: GPL-2.0 |
| |
| #include <linux/linkage.h> |
| #include <asm/cache.h> |
| #include <asm/assembler.h> |
| |
| .text |
| |
| #define state0 v0 |
| #define state1 v1 |
| #define state2 v2 |
| #define state3 v3 |
| #define copy0 v4 |
| #define copy0_q q4 |
| #define copy1 v5 |
| #define copy2 v6 |
| #define copy3 v7 |
| #define copy3_d d7 |
| #define one_d d16 |
| #define one_q q16 |
| #define one_v v16 |
| #define tmp v17 |
| #define rot8 v18 |
| |
| /* |
| * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive |
| * number of blocks of output with nonce 0, taking an input key and 8-byte |
| * counter. Importantly does not spill to the stack. |
| * |
| * This implementation avoids d8-d15 because they are callee-save in user |
| * space. |
| * |
| * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, |
| * const uint8_t *key, |
| * uint32_t *counter, |
| * size_t nblocks) |
| * |
| * x0: output bytes |
| * x1: 32-byte key input |
| * x2: 8-byte counter input/output |
| * x3: number of 64-byte blocks to write to output |
| */ |
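| /* |
| * For reference, a hedged C sketch of the equivalent operation (illustration |
| * only, not part of the build; the function and helper names below are made |
| * up here). Like the assembly, it assumes a little-endian target, a 64-bit |
| * block counter, and an all-zero nonce. |
| * |
| *   #include <stdint.h> |
| *   #include <string.h> |
| * |
| *   static uint32_t rol32(uint32_t v, int n) |
| *   { |
| *           return (v << n) | (v >> (32 - n)); |
| *   } |
| * |
| *   #define QR(a, b, c, d) do {                       \ |
| *           a += b; d ^= a; d = rol32(d, 16);          \ |
| *           c += d; b ^= c; b = rol32(b, 12);          \ |
| *           a += b; d ^= a; d = rol32(d, 8);           \ |
| *           c += d; b ^= c; b = rol32(b, 7);           \ |
| *   } while (0) |
| * |
| *   void chacha20_blocks_ref(uint8_t *dst, const uint8_t *key, |
| *                            uint32_t *counter, size_t nblocks) |
| *   { |
| *           uint32_t s[16], x[16]; |
| *           int i; |
| * |
| *           s[0] = 0x61707865; s[1] = 0x3320646e;     // "expand 32-byte k" |
| *           s[2] = 0x79622d32; s[3] = 0x6b206574; |
| *           memcpy(&s[4], key, 32);                   // 8-word key |
| *           memcpy(&s[12], counter, 8);               // 64-bit block counter |
| *           s[14] = s[15] = 0;                        // zero nonce |
| * |
| *           while (nblocks--) { |
| *                   memcpy(x, s, sizeof(x)); |
| *                   for (i = 0; i < 20; i += 2) { |
| *                           QR(x[0], x[4], x[8],  x[12]);   // column round |
| *                           QR(x[1], x[5], x[9],  x[13]); |
| *                           QR(x[2], x[6], x[10], x[14]); |
| *                           QR(x[3], x[7], x[11], x[15]); |
| *                           QR(x[0], x[5], x[10], x[15]);   // diagonal round |
| *                           QR(x[1], x[6], x[11], x[12]); |
| *                           QR(x[2], x[7], x[8],  x[13]); |
| *                           QR(x[3], x[4], x[9],  x[14]); |
| *                   } |
| *                   for (i = 0; i < 16; i++)          // feed-forward add |
| *                           x[i] += s[i]; |
| *                   memcpy(dst, x, 64); |
| *                   dst += 64; |
| *                   if (++s[12] == 0)                 // 64-bit counter++ |
| *                           s[13]++; |
| *           } |
| *           memcpy(counter, &s[12], 8);               // write counter back |
| *   } |
| */ |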
| SYM_FUNC_START(__arch_chacha20_blocks_nostack) |
| |
| /* copy0 = "expand 32-byte k" */ |
| mov_q x8, 0x3320646e61707865 |
| mov_q x9, 0x6b20657479622d32 |
| mov copy0.d[0], x8 |
| mov copy0.d[1], x9 |
| |
| /* copy1,copy2 = key */ |
| ld1 { copy1.4s, copy2.4s }, [x1] |
| /* copy3 = counter || zero nonce */ |
| ld1 { copy3.2s }, [x2] |
| |
| /* |
| * one_v = { 1, 1, 0, 0 }; uzp1 keeps the even words, giving { 1, 0, 1, 0 }, |
| * so one_d (the low 64 bits of one_v) is the 64-bit constant 1 used below to |
| * step the block counter held in copy3. |
| */ |
| movi one_v.2s, #1 |
| uzp1 one_v.4s, one_v.4s, one_v.4s |
| |
| .Lblock: |
| /* copy state to auxiliary vectors for the final add after the permute. */ |
| mov state0.16b, copy0.16b |
| mov state1.16b, copy1.16b |
| mov state2.16b, copy2.16b |
| mov state3.16b, copy3.16b |
| |
| mov w4, 20 |
| .Lpermute: |
| /* |
| * Permute one 64-byte block where the state matrix is stored in the four NEON |
| * registers state0-state3. It performs matrix operations on four words in parallel, |
| * but requires shuffling to rearrange the words after each round. |
| */ |
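| /* |
| * Each pass through .Ldoubleround is one ChaCha20 double round: four |
| * quarter-rounds down the columns of the 4x4 state, an ext-based rotation of |
| * rows 1-3 so the diagonals line up in the columns, four more quarter-rounds, |
| * and the inverse rotation to restore the layout. The rotate by 16 uses |
| * rev32; the 12-, 8- and 7-bit rotates are built from an shl/sri pair. |
| */ |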
| |
| .Ldoubleround: |
| /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ |
| add state0.4s, state0.4s, state1.4s |
| eor state3.16b, state3.16b, state0.16b |
| rev32 state3.8h, state3.8h |
| |
| /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ |
| add state2.4s, state2.4s, state3.4s |
| eor tmp.16b, state1.16b, state2.16b |
| shl state1.4s, tmp.4s, #12 |
| sri state1.4s, tmp.4s, #20 |
| |
| /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ |
| add state0.4s, state0.4s, state1.4s |
| eor tmp.16b, state3.16b, state0.16b |
| shl state3.4s, tmp.4s, #8 |
| sri state3.4s, tmp.4s, #24 |
| |
| /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ |
| add state2.4s, state2.4s, state3.4s |
| eor tmp.16b, state1.16b, state2.16b |
| shl state1.4s, tmp.4s, #7 |
| sri state1.4s, tmp.4s, #25 |
| |
| /* state1[0,1,2,3] = state1[1,2,3,0] */ |
| ext state1.16b, state1.16b, state1.16b, #4 |
| /* state2[0,1,2,3] = state2[2,3,0,1] */ |
| ext state2.16b, state2.16b, state2.16b, #8 |
| /* state3[0,1,2,3] = state3[3,0,1,2] */ |
| ext state3.16b, state3.16b, state3.16b, #12 |
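| /* The diagonals of the state matrix now occupy the columns. */ |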
| |
| /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ |
| add state0.4s, state0.4s, state1.4s |
| eor state3.16b, state3.16b, state0.16b |
| rev32 state3.8h, state3.8h |
| |
| /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ |
| add state2.4s, state2.4s, state3.4s |
| eor tmp.16b, state1.16b, state2.16b |
| shl state1.4s, tmp.4s, #12 |
| sri state1.4s, tmp.4s, #20 |
| |
| /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ |
| add state0.4s, state0.4s, state1.4s |
| eor tmp.16b, state3.16b, state0.16b |
| shl state3.4s, tmp.4s, #8 |
| sri state3.4s, tmp.4s, #24 |
| |
| /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ |
| add state2.4s, state2.4s, state3.4s |
| eor tmp.16b, state1.16b, state2.16b |
| shl state1.4s, tmp.4s, #7 |
| sri state1.4s, tmp.4s, #25 |
| |
| /* state1[0,1,2,3] = state1[3,0,1,2] */ |
| ext state1.16b, state1.16b, state1.16b, #12 |
| /* state2[0,1,2,3] = state2[2,3,0,1] */ |
| ext state2.16b, state2.16b, state2.16b, #8 |
| /* state3[0,1,2,3] = state3[1,2,3,0] */ |
| ext state3.16b, state3.16b, state3.16b, #4 |
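| /* Rows 1-3 are rotated back, restoring the original column layout. */ |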
| |
| subs w4, w4, #2 |
| b.ne .Ldoubleround |
| |
| /* output0 = state0 + copy0 */ |
| add state0.4s, state0.4s, copy0.4s |
| /* output1 = state1 + copy1 */ |
| add state1.4s, state1.4s, copy1.4s |
| /* output2 = state2 + copy2 */ |
| add state2.4s, state2.4s, copy2.4s |
| /* output3 = state3 + copy3 */ |
| add state3.4s, state3.4s, copy3.4s |
| st1 { state0.16b - state3.16b }, [x0] |
| |
| /* |
| * ++copy3.counter; the scalar 'add' clears the upper half of the SIMD |
| * register, which is the expected behaviour here: those lanes hold the |
| * all-zero nonce, so clearing them leaves the state unchanged. |
| */ |
| add copy3_d, copy3_d, one_d |
| |
| /* output += 64, --nblocks */ |
| add x0, x0, 64 |
| subs x3, x3, #1 |
| b.ne .Lblock |
| |
| /* counter = copy3.counter */ |
| st1 { copy3.2s }, [x2] |
| |
| /* Zero out the potentially sensitive regs, in case nothing uses these again. */ |
| movi state0.16b, #0 |
| movi state1.16b, #0 |
| movi state2.16b, #0 |
| movi state3.16b, #0 |
| movi copy1.16b, #0 |
| movi copy2.16b, #0 |
| ret |
| SYM_FUNC_END(__arch_chacha20_blocks_nostack) |
| |
| emit_aarch64_feature_1_and |