| /* SPDX-License-Identifier: GPL-2.0-only */ |
| /* |
| * SSE2 implementation of MORUS-640 |
| * |
| * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> |
| * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. |
| */ |
| |
| #include <linux/linkage.h> |
| #include <asm/frame.h> |
| |
/*
 * Build a pshufd immediate: destination dword k is taken from source
 * dword i<k>.  Each argument is parenthesized so that an expression passed
 * as an argument cannot change the grouping of the shifts and ORs.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	((i0) | ((i1) << 2) | ((i2) << 4) | ((i3) << 6))

/* The three non-trivial rotations of the four dwords in an XMM register. */
#define MASK1	SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2	SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3	SHUFFLE_MASK(1, 2, 3, 0)

/* The five 128-bit state words are kept in %xmm0-%xmm4. */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
/*
 * KEY and MSG deliberately alias the same register.  KEY is only used by
 * the init routine, which runs the message-less update, so the two roles
 * are never live at the same time.
 */
#define KEY	%xmm5
#define MSG	%xmm5
/* Scratch registers. */
#define T0	%xmm6
#define T1	%xmm7
| |
/*
 * Two 16-byte constants, loaded into STATE3 and STATE4 by
 * crypto_morus640_sse2_init.  Listed one 32-bit word per line.
 */
.section	.rodata.cst16.morus640_const, "aM", @progbits, 32
.balign 16
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte i of this constant holds the value i.  The partial-block decrypt
 * path compares it against a broadcast byte count to build a byte mask.
 */
.section	.rodata.cst16.morus640_counter, "aM", @progbits, 16
.balign 16
.Lmorus640_counter:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
| |
.text

/*
 * morus640_round - update one state word
 *   s0     - word being updated (read and written)
 *   s1, s2 - words that are ANDed together and XORed into s0
 *   s3     - word XORed into s0, then permuted in place with pshufd
 *   s4     - not referenced by the macro body; callers still pass it
 *   b      - left-rotate count applied to every 32-bit lane of s0
 *   w      - pshufd immediate used to permute the dwords of s3
 * Clobbers T0.
 */
.macro morus640_round s0, s1, s2, s3, s4, b, w
	/* s0 ^= s3 ^ (s1 & s2) */
	pxor \s3, \s0
	movdqa \s1, T0
	pand \s2, T0
	pxor T0, \s0
	/*
	 * s0 = rol32(s0, b) in each lane: the two shifted halves have no
	 * bits in common, so OR combines them.
	 */
	movdqa \s0, T0
	pslld $\b, T0
	psrld $(32 - \b), \s0
	por T0, \s0
	/* permute the dwords of s3 */
	pshufd $\w, \s3, \s3
.endm
| |
/*
 * __morus640_update: internal ABI
 *
 * Runs the five rounds of one state update.  The message block is XORed
 * into STATE1..STATE4 between rounds; each XOR has to stay between the
 * round that last reads the word and the round that updates it.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus640_update:
	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
	pxor MSG, STATE1

	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
	pxor MSG, STATE2

	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
	pxor MSG, STATE3

	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
	pxor MSG, STATE4

	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
	ret
ENDPROC(__morus640_update)
| |
| |
/*
 * __morus640_update_zero: internal ABI
 *
 * Same five rounds as __morus640_update, but without XORing a message
 * block into the state.  MSG/KEY (%xmm5) is neither read nor written.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 */
__morus640_update_zero:
	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1

	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2

	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3

	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2

	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
	ret
ENDPROC(__morus640_update_zero)
| |
/*
 * __load_partial: internal ABI
 *
 * Loads the first `bytes` bytes at src into MSG and zero-fills the rest
 * of the register.  Each set bit of the count (1, 2, 4, 8) selects one
 * naturally sized load whose offset is the count with that bit and all
 * lower bits cleared, so no byte past src + bytes is read.
 * NOTE(review): the offsets are only meaningful for counts below 16;
 * the visible callers are the *_tail routines - confirm they never pass
 * more.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   flags
 */
__load_partial:
	xor %r9d, %r9d
	pxor MSG, MSG

	/* bit 0: a single byte at offset (bytes & 0x1E) */
	test $0x1, %cl
	jz .Lld_partial_1

	mov %rcx, %r8
	and $0x1E, %r8
	/* zero-extending load avoids a partial write to %r9 */
	movzbl (%rsi, %r8), %r9d

.Lld_partial_1:
	/* bit 1: a 16-bit word at offset (bytes & 0x1C) */
	test $0x2, %cl
	jz .Lld_partial_2

	mov %rcx, %r8
	and $0x1C, %r8
	shl $16, %r9
	movzwl (%rsi, %r8), %r8d
	/* the low 16 bits of %r9 are zero after the shift */
	or %r8, %r9

.Lld_partial_2:
	/* bit 2: a 32-bit dword at offset (bytes & 0x18) */
	test $0x4, %cl
	jz .Lld_partial_4

	mov %rcx, %r8
	and $0x18, %r8
	shl $32, %r9
	mov (%rsi, %r8), %r8d
	/* the low 32 bits of %r9 are zero after the shift */
	or %r8, %r9

.Lld_partial_4:
	/* up to 7 gathered bytes go to the low qword; the high qword is 0 */
	movq %r9, MSG

	/* bit 3: a 64-bit qword at offset (bytes & 0x10) */
	test $0x8, %cl
	jz .Lld_partial_8

	mov %rcx, %r8
	and $0x10, %r8
	/* move the gathered bytes up to make room for the leading qword */
	pslldq $8, MSG
	movq (%rsi, %r8), T0
	pxor T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)
| |
/*
 * __store_partial: internal ABI
 *
 * Writes the low `bytes` bytes of T0 to dst, using at most one 8-, 4-,
 * 2- and 1-byte store in that order.  Nothing past dst + bytes is
 * written for counts below 16.
 *
 * The count is taken from the low 32 bits of %rcx (the C prototypes of
 * the callers declare the length as unsigned int) and is compared
 * unsigned.
 *
 * input:
 *   %rdx - dst
 *   %rcx - bytes
 *   T0   - message block to store
 * changed:
 *   T0
 *   %r8
 *   %r9
 *   %r10
 *   flags
 */
__store_partial:
	mov %ecx, %r8d			/* remaining bytes, zero-extended */
	mov %rdx, %r9			/* write cursor */

	movq T0, %r10			/* low qword of the block */

	cmp $8, %r8d
	jb .Lst_partial_8

	mov %r10, (%r9)
	/* continue with the high qword */
	psrldq $8, T0
	movq T0, %r10

	sub $8, %r8d
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8d
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8d
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8d
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	sub $2, %r8d
	add $2, %r9

.Lst_partial_2:
	test %r8d, %r8d
	jz .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
| |
/*
 * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial state from the 16-byte IV (%rdx) and 16-byte key
 * (%rsi) and stores the five state words to state (%rdi).  All three
 * pointers are accessed with unaligned moves.
 */
ENTRY(crypto_morus640_sse2_init)
	FRAME_BEGIN

	/* STATE0 = IV */
	movdqu (%rdx), STATE0
	/* STATE1 = key; KEY keeps a copy for the xor after the updates */
	movdqu (%rsi), KEY
	movdqa KEY, STATE1
	/* STATE2 = all ones */
	pcmpeqd STATE2, STATE2
	/*
	 * STATE3/STATE4 = constants.  RIP-relative addressing keeps the
	 * references position-independent and needs no absolute relocation.
	 */
	movdqa .Lmorus640_const_0(%rip), STATE3
	movdqa .Lmorus640_const_1(%rip), STATE4

	/* update 16 times with zero (KEY is left untouched by these calls): */
	.rept 16
	call __morus640_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor KEY, STATE1

	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_init)
| |
/*
 * void crypto_morus640_sse2_ad(void *state, const void *data,
 *                              unsigned int length);
 *
 * Absorbs every complete 16-byte block of data (%rsi) into the state at
 * %rdi.  If length is below 16 the state is not touched; a trailing
 * partial block (length % 16 bytes) is not consumed here.
 *
 * length is a 32-bit argument, so it is only accessed through %edx: the
 * upper half of %rdx is not defined by the calling convention.  All
 * length comparisons are unsigned.
 */
ENTRY(crypto_morus640_sse2_ad)
	FRAME_BEGIN

	cmp $16, %edx
	jb .Lad_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* use the movdqa loop only when data is 16-byte aligned */
	test $0xF, %sil
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa (%rsi), MSG
	call __morus640_update
	add $16, %rsi
	sub $16, %edx
	cmp $16, %edx
	jae .Lad_a_loop

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	movdqu (%rsi), MSG
	call __morus640_update
	add $16, %rsi
	sub $16, %edx
	cmp $16, %edx
	jae .Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_ad)
| |
/*
 * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
 *                               unsigned int length);
 *
 * Encrypts every complete 16-byte block from src (%rsi) to dst (%rdx) and
 * absorbs the plaintext into the state at %rdi.  If length is below 16
 * nothing is read or written; a trailing partial block is not handled
 * here.
 *
 * length is a 32-bit argument, so it is only accessed through %ecx: the
 * upper half of %rcx is not defined by the calling convention.  All
 * length comparisons are unsigned.
 */
ENTRY(crypto_morus640_sse2_enc)
	FRAME_BEGIN

	cmp $16, %ecx
	jb .Lenc_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* use the movdqa loop only when src and dst are both 16-byte aligned */
	mov %rsi, %r8
	or %rdx, %r8
	test $0xF, %r8b
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa (%rsi), MSG
	/* T0 = MSG ^ STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
	movdqa MSG, T0
	pxor STATE0, T0
	pshufd $MASK3, STATE1, T1
	pxor T1, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0
	movdqa T0, (%rdx)

	/* absorb the plaintext block (still in MSG) */
	call __morus640_update
	add $16, %rsi
	add $16, %rdx
	sub $16, %ecx
	cmp $16, %ecx
	jae .Lenc_a_loop

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu (%rsi), MSG
	/* T0 = MSG ^ STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
	movdqa MSG, T0
	pxor STATE0, T0
	pshufd $MASK3, STATE1, T1
	pxor T1, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0
	movdqu T0, (%rdx)

	/* absorb the plaintext block (still in MSG) */
	call __morus640_update
	add $16, %rsi
	add $16, %rdx
	sub $16, %ecx
	cmp $16, %ecx
	jae .Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc)
| |
/*
 * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                    unsigned int length);
 *
 * Encrypts one partial block: `length` bytes are read from src (%rsi)
 * into a zero-padded block, the same number of ciphertext bytes is written
 * to dst (%rdx), and the zero-padded plaintext is absorbed into the state
 * at %rdi.
 */
ENTRY(crypto_morus640_sse2_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* MSG = plaintext bytes, zero-padded to 16 bytes */
	call __load_partial

	/* T0 = shuffle(STATE1) ^ STATE0 ^ MSG ^ (STATE2 & STATE3) */
	pshufd $MASK3, STATE1, T0
	pxor STATE0, T0
	pxor MSG, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	/* write only the first `length` ciphertext bytes */
	call __store_partial

	/* absorb the padded plaintext, which is still in MSG */
	call __morus640_update

	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc_tail)
| |
/*
 * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
 *                               unsigned int length);
 *
 * Decrypts every complete 16-byte block from src (%rsi) to dst (%rdx) and
 * absorbs the recovered plaintext into the state at %rdi.  If length is
 * below 16 nothing is read or written; a trailing partial block is not
 * handled here.
 *
 * length is a 32-bit argument, so it is only accessed through %ecx: the
 * upper half of %rcx is not defined by the calling convention.  All
 * length comparisons are unsigned.
 */
ENTRY(crypto_morus640_sse2_dec)
	FRAME_BEGIN

	cmp $16, %ecx
	jb .Ldec_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* use the movdqa loop only when src and dst are both 16-byte aligned */
	mov %rsi, %r8
	or %rdx, %r8
	test $0xF, %r8b
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa (%rsi), MSG
	/* MSG ^= STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0, MSG
	pshufd $MASK3, STATE1, T0
	pxor T0, MSG
	movdqa STATE2, T0
	pand STATE3, T0
	pxor T0, MSG
	movdqa MSG, (%rdx)

	/* absorb the plaintext block now held in MSG */
	call __morus640_update
	add $16, %rsi
	add $16, %rdx
	sub $16, %ecx
	cmp $16, %ecx
	jae .Ldec_a_loop

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu (%rsi), MSG
	/* MSG ^= STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0, MSG
	pshufd $MASK3, STATE1, T0
	pxor T0, MSG
	movdqa STATE2, T0
	pand STATE3, T0
	pxor T0, MSG
	movdqu MSG, (%rdx)

	/* absorb the plaintext block now held in MSG */
	call __morus640_update
	add $16, %rsi
	add $16, %rdx
	sub $16, %ecx
	cmp $16, %ecx
	jae .Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec)
| |
/*
 * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                    unsigned int length);
 *
 * Decrypts one partial block: `length` bytes are read from src (%rsi)
 * into a zero-padded block, the same number of plaintext bytes is written
 * to dst (%rdx), and the plaintext, with every byte at or beyond `length`
 * forced to zero, is absorbed into the state at %rdi.
 */
ENTRY(crypto_morus640_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* MSG = ciphertext bytes, zero-padded to 16 bytes */
	call __load_partial

	/* MSG ^= STATE0 ^ shuffle(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0, MSG
	pshufd $MASK3, STATE1, T0
	pxor T0, MSG
	movdqa STATE2, T0
	pand STATE3, T0
	pxor T0, MSG
	/* __store_partial takes its data in T0 and clobbers it */
	movdqa MSG, T0

	call __store_partial

	/*
	 * mask with byte count: the padding bytes of MSG were XORed with the
	 * keystream above and are no longer zero.  Broadcast the low byte of
	 * the count to all 16 bytes of T0, then keep byte i of MSG only
	 * where count > i.
	 */
	movq %rcx, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	/* RIP-relative: no absolute relocation for the constant */
	movdqa .Lmorus640_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	call __morus640_update

	/* store the state: */
	movdqu STATE0, (0 * 16)(%rdi)
	movdqu STATE1, (1 * 16)(%rdi)
	movdqu STATE2, (2 * 16)(%rdi)
	movdqu STATE3, (3 * 16)(%rdi)
	movdqu STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec_tail)
| |
/*
 * void crypto_morus640_sse2_final(void *state, void *tag_xor,
 *                                 u64 assoclen, u64 cryptlen);
 *
 * Runs the finalization rounds on a register copy of the state at %rdi
 * and XORs the resulting 16-byte value into the buffer at tag_xor (%rsi).
 * The state in memory is only read, never written back.
 */
ENTRY(crypto_morus640_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0
	movdqu (1 * 16)(%rdi), STATE1
	movdqu (2 * 16)(%rdi), STATE2
	movdqu (3 * 16)(%rdi), STATE3
	movdqu (4 * 16)(%rdi), STATE4

	/* xor state[0] into state[4]: */
	pxor STATE0, STATE4

	/* length block: low qword = assoclen, high qword = cryptlen */
	movq %rdx, MSG
	movq %rcx, T0
	punpcklqdq T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	/* ten updates with the length block: */
	.rept 10
	call __morus640_update
	.endr

	/* tag ^= shuffle(STATE1) ^ (STATE2 & STATE3) ^ STATE0 */
	movdqu (%rsi), MSG

	pshufd $MASK3, STATE1, T0
	pxor T0, MSG
	movdqa STATE2, T0
	pand STATE3, T0
	pxor T0, MSG
	pxor STATE0, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_final)