/*
 * SSE2 implementation of MORUS-1280
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * Build a pshufd immediate from four 2-bit dword selectors: i0 occupies
 * bits 1:0, i1 bits 3:2, i2 bits 5:4 and i3 bits 7:6.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

/* Selectors 2,3,0,1: pshufd with this mask swaps the two 64-bit halves. */
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)

/*
 * Register allocation.
 *
 * Each state word occupies a register pair: the _LO register is loaded from
 * (and stored to) the lower 16 bytes of the word in memory, the _HI register
 * the 16 bytes above it.
 */
#define STATE0_LO	%xmm0
#define STATE0_HI	%xmm1
#define STATE1_LO	%xmm2
#define STATE1_HI	%xmm3
#define STATE2_LO	%xmm4
#define STATE2_HI	%xmm5
#define STATE3_LO	%xmm6
#define STATE3_HI	%xmm7
#define STATE4_LO	%xmm8
#define STATE4_HI	%xmm9
/* KEY and MSG deliberately name the same register pair (%xmm10/%xmm11). */
#define KEY_LO		%xmm10
#define KEY_HI		%xmm11
#define MSG_LO		%xmm10
#define MSG_HI		%xmm11
/* Scratch pairs; T0_LO is additionally clobbered by the rol1/rol2/rol3 macros. */
#define T0_LO		%xmm12
#define T0_HI		%xmm13
#define T1_LO		%xmm14
#define T1_HI		%xmm15

/*
 * Initialization constant loaded into STATE4 by crypto_morus1280_sse2_init.
 * The 32 bytes are consecutive Fibonacci numbers reduced modulo 256
 * (0, 1, 1, 2, 3, 5, 8, 13, ...).
 *
 * NOTE(review): the section and label names say "morus640" although this
 * file implements MORUS-1280 - presumably inherited from the MORUS-640
 * source. The labels are referenced by code further down, so confirm all
 * users before renaming.
 */
.section .rodata.cst16.morus640_const, "aM", @progbits, 16
.align 16
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte-index table: byte i holds the value i (0x00..0x1f). It is compared
 * against a broadcast byte count in crypto_morus1280_sse2_dec_tail to build
 * a mask that keeps only the first 'count' bytes of a block.
 */
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.align 16
.Lmorus640_counter_0:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus640_counter_1:
	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f

.text

/*
 * rol1 hi, lo: rotate the 256-bit value held in \hi:\lo left by one 64-bit
 * word (see the diagram below; the most significant qword is on the left).
 *
 * Clobbers T0_LO. Neither \hi nor \lo may be T0_LO, because the scratch copy
 * would overwrite an operand before it has been fully consumed.
 */
.macro rol1 hi, lo
	/*
	 * HI_1 | HI_0 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | HI_1 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | LO_1 || LO_0 | HI_1
	 */
	/* swap the two qwords of \hi */
	pshufd $MASK2, \hi, \hi
	/* T0_LO = LO_0 | HI_1 (the future \lo) */
	movdqa \hi, T0_LO
	punpcklqdq \lo, T0_LO
	/* \lo = HI_0 | LO_1 (the future \hi) */
	punpckhqdq \hi, \lo
	movdqa \lo, \hi
	movdqa T0_LO, \lo
.endm

/*
 * rol2 hi, lo: exchange \hi and \lo, i.e. rotate the 256-bit value held in
 * \hi:\lo by two 64-bit words.
 *
 * Clobbers T0_LO (it ends up holding the original \lo). Neither argument may
 * be T0_LO.
 */
.macro rol2 hi, lo
	movdqa \lo, T0_LO
	movdqa \hi, \lo
	movdqa T0_LO, \hi
.endm

/*
 * rol3 hi, lo: rotate the 256-bit value held in \hi:\lo left by three 64-bit
 * words (see the diagram below; the most significant qword is on the left).
 *
 * Clobbers T0_LO. Neither \hi nor \lo may be T0_LO: with \lo == T0_LO the
 * punpcklqdq below would read the already-modified scratch value instead of
 * the original low qword of \lo, and with \hi == T0_LO the scratch copy would
 * overwrite \hi.
 */
.macro rol3 hi, lo
	/*
	 * HI_1 | HI_0 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | HI_1 || LO_1 | LO_0
	 *  ==>
	 * LO_0 | HI_1 || HI_0 | LO_1
	 */
	/* swap the two qwords of \hi */
	pshufd $MASK2, \hi, \hi
	/* T0_LO = HI_0 | LO_1 (the future \lo) */
	movdqa \lo, T0_LO
	punpckhqdq \hi, T0_LO
	/* \hi = LO_0 | HI_1; needs the original, unmodified \lo */
	punpcklqdq \lo, \hi
	movdqa T0_LO, \lo
.endm

/*
 * morus1280_round: one round of the state update, operating on LO/HI
 * register pairs:
 *
 *   s0 ^= (s1 & s2) ^ s3
 *   s0  = every 64-bit lane of s0 rotated left by \b bits
 *   s3  = \w applied to (s3_h, s3_l), where \w is rol1, rol2 or rol3
 *
 * s4_l and s4_h are accepted for a uniform call shape but are not referenced
 * by the macro body.
 *
 * Clobbers T0_LO (directly and through \w).
 */
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
	/* s0 ^= s1 & s2 (low half) */
	movdqa \s1_l, T0_LO
	pand \s2_l, T0_LO
	pxor T0_LO, \s0_l

	/* s0 ^= s1 & s2 (high half) */
	movdqa \s1_h, T0_LO
	pand \s2_h, T0_LO
	pxor T0_LO, \s0_h

	/* s0 ^= s3 */
	pxor \s3_l, \s0_l
	pxor \s3_h, \s0_h

	/* rotate each 64-bit lane of the low half left by \b */
	movdqa \s0_l, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_l
	pxor T0_LO, \s0_l

	/* rotate each 64-bit lane of the high half left by \b */
	movdqa \s0_h, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_h
	pxor T0_LO, \s0_h

	/* word-rotate s3 */
	\w \s3_h, \s3_l
.endm

/*
 * __morus1280_update: internal ABI
 *
 * Runs the five rounds of one state update. The message block is XORed into
 * STATE1, STATE2, STATE3 and STATE4, each just before the round that updates
 * that state word.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0 (only T0_LO is written; MSG is left unmodified)
 */
__morus1280_update:
	/* round 1: update STATE0 (lane rotate 13), word-rotate STATE3 (rol1) */
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1
	/* round 2: update STATE1 ^ MSG (lane rotate 46), word-rotate STATE4 (rol2) */
	pxor MSG_LO, STATE1_LO
	pxor MSG_HI, STATE1_HI
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2
	/* round 3: update STATE2 ^ MSG (lane rotate 38), word-rotate STATE0 (rol3) */
	pxor MSG_LO, STATE2_LO
	pxor MSG_HI, STATE2_HI
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3
	/* round 4: update STATE3 ^ MSG (lane rotate 7), word-rotate STATE1 (rol2) */
	pxor MSG_LO, STATE3_LO
	pxor MSG_HI, STATE3_HI
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2
	/* round 5: update STATE4 ^ MSG (lane rotate 4), word-rotate STATE2 (rol1) */
	pxor MSG_LO, STATE4_LO
	pxor MSG_HI, STATE4_HI
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update)

/*
 * __morus1280_update_zero: internal ABI
 *
 * Same five rounds as __morus1280_update, but without XORing a message block
 * into the state between rounds (equivalent to an all-zero message). The MSG
 * registers are neither read nor written.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0 (only T0_LO is written)
 */
__morus1280_update_zero:
	/* round 1: update STATE0 (lane rotate 13), word-rotate STATE3 (rol1) */
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1
	/* round 2: update STATE1 (lane rotate 46), word-rotate STATE4 (rol2) */
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2
	/* round 3: update STATE2 (lane rotate 38), word-rotate STATE0 (rol3) */
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3
	/* round 4: update STATE3 (lane rotate 7), word-rotate STATE1 (rol2) */
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2
	/* round 5: update STATE4 (lane rotate 4), word-rotate STATE2 (rol1) */
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update_zero)

/*
 * __load_partial: internal ABI
 *
 * Assembles a zero-padded message block from the first %rcx bytes at %rsi.
 * Each bit of the byte count selects one power-of-two sized piece; pieces
 * are read from the end of the buffer towards its start, so no byte beyond
 * src + bytes is touched.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes (only bits 0..4 are examined)
 * output:
 *   MSG  - message block
 * changed:
 *   %r8
 *   %r9
 *   T0_LO (only when bit 3 of the byte count is set)
 */
__load_partial:
	pxor MSG_LO, MSG_LO
	pxor MSG_HI, MSG_HI
	xor %r9, %r9

	/* bit 0: a single byte at offset (bytes & 0x1E) */
	testb $0x1, %cl
	jz .Lld_partial_1

	mov %rcx, %r8
	and $0x1E, %r8
	mov (%rsi, %r8), %r9b

.Lld_partial_1:
	/* bit 1: two bytes at offset (bytes & 0x1C), slotted in below */
	testb $0x2, %cl
	jz .Lld_partial_2

	mov %rcx, %r8
	and $0x1C, %r8
	shl $16, %r9
	mov (%rsi, %r8), %r9w

.Lld_partial_2:
	/* bit 2: four bytes at offset (bytes & 0x18), slotted in below */
	testb $0x4, %cl
	jz .Lld_partial_4

	mov %rcx, %r8
	and $0x18, %r8
	shl $32, %r9
	mov (%rsi, %r8), %r8d
	xor %r8, %r9

.Lld_partial_4:
	/* up to seven collected bytes now sit in the low qword */
	movq %r9, MSG_LO

	/* bit 3: eight bytes at offset (bytes & 0x10), slotted in below */
	testb $0x8, %cl
	jz .Lld_partial_8

	mov %rcx, %r8
	and $0x10, %r8
	pslldq $8, MSG_LO
	movq (%rsi, %r8), T0_LO
	pxor T0_LO, MSG_LO

.Lld_partial_8:
	/* bit 4: what we have becomes the high half; low half is src[0..15] */
	testb $0x10, %cl
	jz .Lld_partial_16

	movdqa MSG_LO, MSG_HI
	movdqu (%rsi), MSG_LO

.Lld_partial_16:
	ret
ENDPROC(__load_partial)

/*
 * __store_partial: internal ABI
 *
 * Writes the leading %rcx bytes of the block held in T0 to %rdx, in pieces
 * of 16, 8, 4, 2 and 1 bytes.
 * NOTE(review): callers presumably pass a byte count below 32 - confirm.
 *
 * input:
 *   %rdx - dst
 *   %rcx - bytes
 *   T0   - message block
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0_LO
 */
__store_partial:
	mov %rdx, %r9		/* r9  = write cursor */
	mov %rcx, %r8		/* r8  = bytes still to write */

	cmp $16, %r8
	jl .Lst_partial_16

	/* emit the low half, then carry on with the high half */
	movdqu T0_LO, (%r9)
	movdqa T0_HI, T0_LO

	add $16, %r9
	sub $16, %r8

.Lst_partial_16:
	/* r10 = the next (at most) eight pending bytes */
	movq T0_LO, %r10

	cmp $8, %r8
	jl .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0_LO
	movq T0_LO, %r10

	add $8, %r9
	sub $8, %r8

.Lst_partial_8:
	cmp $4, %r8
	jl .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	add $4, %r9
	sub $4, %r8

.Lst_partial_4:
	cmp $2, %r8
	jl .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	add $2, %r9
	sub $2, %r8

.Lst_partial_2:
	cmp $1, %r8
	jl .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)

/*
 * void crypto_morus1280_sse2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * Builds the initial state from a 32-byte key and a 16-byte IV and writes
 * the resulting 160-byte state to *state.
 *
 * input:
 *   %rdi - state (written, 10 * 16 bytes)
 *   %rsi - key   (32 bytes read)
 *   %rdx - iv    (16 bytes read)
 * changed:
 *   STATE[0-4], KEY, T0_LO
 */
ENTRY(crypto_morus1280_sse2_init)
	FRAME_BEGIN

	/* load IV: the IV fills the low half, the high half is zero */
	pxor STATE0_HI, STATE0_HI
	movdqu (%rdx), STATE0_LO
	/* load key: */
	movdqu  0(%rsi), KEY_LO
	movdqu 16(%rsi), KEY_HI
	movdqa KEY_LO, STATE1_LO
	movdqa KEY_HI, STATE1_HI
	/* load all ones: */
	pcmpeqd STATE2_LO, STATE2_LO
	pcmpeqd STATE2_HI, STATE2_HI
	/* load all zeros: */
	pxor STATE3_LO, STATE3_LO
	pxor STATE3_HI, STATE3_HI
	/*
	 * load the constant: addressed RIP-relative so the code does not
	 * depend on absolute relocations against .rodata
	 */
	movdqa .Lmorus640_const_0(%rip), STATE4_LO
	movdqa .Lmorus640_const_1(%rip), STATE4_HI

	/*
	 * update 16 times with zero; __morus1280_update_zero does not touch
	 * the KEY registers, so the key survives for the final XOR below
	 */
	.rept 16
	call __morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor KEY_LO, STATE1_LO
	pxor KEY_HI, STATE1_HI

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_init)

/*
 * void crypto_morus1280_sse2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * Absorbs every whole 32-byte block of associated data into the state.
 * Returns without touching the state when length < 32; a trailing partial
 * block is not processed here.
 *
 * input:
 *   %rdi - state (read and written back)
 *   %rsi - data
 *   %rdx - length in bytes
 */
ENTRY(crypto_morus1280_sse2_ad)
	FRAME_BEGIN

	/* bail out unless at least one whole block is available */
	cmp $32, %rdx
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(%rdi), STATE0_LO
	movdqu 0x10(%rdi), STATE0_HI
	movdqu 0x20(%rdi), STATE1_LO
	movdqu 0x30(%rdi), STATE1_HI
	movdqu 0x40(%rdi), STATE2_LO
	movdqu 0x50(%rdi), STATE2_HI
	movdqu 0x60(%rdi), STATE3_LO
	movdqu 0x70(%rdi), STATE3_HI
	movdqu 0x80(%rdi), STATE4_LO
	movdqu 0x90(%rdi), STATE4_HI

	/* pick the aligned loop only when data is 16-byte aligned */
	testb $0xF, %sil
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	call __morus1280_update
	add $32, %rsi
	sub $32, %rdx
	cmp $32, %rdx
	jge .Lad_a_loop

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	call __morus1280_update
	add $32, %rsi
	sub $32, %rdx
	cmp $32, %rdx
	jge .Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu STATE0_LO, 0x00(%rdi)
	movdqu STATE0_HI, 0x10(%rdi)
	movdqu STATE1_LO, 0x20(%rdi)
	movdqu STATE1_HI, 0x30(%rdi)
	movdqu STATE2_LO, 0x40(%rdi)
	movdqu STATE2_HI, 0x50(%rdi)
	movdqu STATE3_LO, 0x60(%rdi)
	movdqu STATE3_HI, 0x70(%rdi)
	movdqu STATE4_LO, 0x80(%rdi)
	movdqu STATE4_HI, 0x90(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_ad)

/*
 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Encrypts every whole 32-byte block of src into dst and absorbs the
 * plaintext into the state. Per block:
 *
 *   dst = src ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * Returns without touching anything when length < 32; a trailing partial
 * block is not processed here.
 *
 * input:
 *   %rdi - state (read and written back)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length in bytes
 */
ENTRY(crypto_morus1280_sse2_enc)
	FRAME_BEGIN

	/* bail out unless at least one whole block is available */
	cmp $32, %rcx
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(%rdi), STATE0_LO
	movdqu 0x10(%rdi), STATE0_HI
	movdqu 0x20(%rdi), STATE1_LO
	movdqu 0x30(%rdi), STATE1_HI
	movdqu 0x40(%rdi), STATE2_LO
	movdqu 0x50(%rdi), STATE2_HI
	movdqu 0x60(%rdi), STATE3_LO
	movdqu 0x70(%rdi), STATE3_HI
	movdqu 0x80(%rdi), STATE4_LO
	movdqu 0x90(%rdi), STATE4_HI

	/* the aligned loop needs both src and dst 16-byte aligned */
	mov %esi, %r8d
	or  %edx, %r8d
	testb $0xF, %r8b
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI

	/* T1 = rol3(STATE1); rol3 scratches T0_LO, so T0 is built afterwards */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO

	/* low half of the ciphertext */
	movdqa STATE0_LO, T0_LO
	pxor MSG_LO, T0_LO
	pxor T1_LO, T0_LO
	movdqa STATE2_LO, T1_LO
	pand STATE3_LO, T1_LO
	pxor T1_LO, T0_LO

	/* high half of the ciphertext */
	movdqa STATE0_HI, T0_HI
	pxor MSG_HI, T0_HI
	pxor T1_HI, T0_HI
	movdqa STATE2_HI, T1_HI
	pand STATE3_HI, T1_HI
	pxor T1_HI, T0_HI

	movdqa T0_LO,  0(%rdx)
	movdqa T0_HI, 16(%rdx)

	/* absorb the plaintext block (still in MSG) */
	call __morus1280_update
	add $32, %rsi
	add $32, %rdx
	sub $32, %rcx
	cmp $32, %rcx
	jge .Lenc_a_loop

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	/* T1 = rol3(STATE1); rol3 scratches T0_LO, so T0 is built afterwards */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO

	/* low half of the ciphertext */
	movdqa STATE0_LO, T0_LO
	pxor MSG_LO, T0_LO
	pxor T1_LO, T0_LO
	movdqa STATE2_LO, T1_LO
	pand STATE3_LO, T1_LO
	pxor T1_LO, T0_LO

	/* high half of the ciphertext */
	movdqa STATE0_HI, T0_HI
	pxor MSG_HI, T0_HI
	pxor T1_HI, T0_HI
	movdqa STATE2_HI, T1_HI
	pand STATE3_HI, T1_HI
	pxor T1_HI, T0_HI

	movdqu T0_LO,  0(%rdx)
	movdqu T0_HI, 16(%rdx)

	/* absorb the plaintext block (still in MSG) */
	call __morus1280_update
	add $32, %rsi
	add $32, %rdx
	sub $32, %rcx
	cmp $32, %rcx
	jge .Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu STATE0_LO, 0x00(%rdi)
	movdqu STATE0_HI, 0x10(%rdi)
	movdqu STATE1_LO, 0x20(%rdi)
	movdqu STATE1_HI, 0x30(%rdi)
	movdqu STATE2_LO, 0x40(%rdi)
	movdqu STATE2_HI, 0x50(%rdi)
	movdqu STATE3_LO, 0x60(%rdi)
	movdqu STATE3_HI, 0x70(%rdi)
	movdqu STATE4_LO, 0x80(%rdi)
	movdqu STATE4_HI, 0x90(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc)

/*
 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Encrypts a final partial block: the first 'length' bytes of src are loaded
 * zero-padded, encrypted, and the first 'length' ciphertext bytes are written
 * to dst. The zero-padded plaintext is then absorbed into the state.
 *
 * input:
 *   %rdi - state (read and written back)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length in bytes
 * changed:
 *   %r8, %r9, %r10 (via __load_partial / __store_partial)
 */
ENTRY(crypto_morus1280_sse2_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* encrypt message: MSG = zero-padded plaintext */
	call __load_partial

	/* T0 = MSG ^ rol3(STATE1) ^ STATE0 ^ (STATE2 & STATE3) */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI

	/* write only the first %rcx ciphertext bytes */
	call __store_partial

	/* absorb the zero-padded plaintext, which is still in MSG */
	call __morus1280_update

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	/* return to the caller; without this, control runs on into the next function */
	ret
ENDPROC(crypto_morus1280_sse2_enc_tail)

/*
 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Decrypts every whole 32-byte block of src into dst and absorbs the
 * recovered plaintext into the state. Per block:
 *
 *   dst = src ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * Returns without touching anything when length < 32; a trailing partial
 * block is not processed here.
 *
 * input:
 *   %rdi - state (read and written back)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length in bytes
 */
ENTRY(crypto_morus1280_sse2_dec)
	FRAME_BEGIN

	/* bail out unless at least one whole block is available */
	cmp $32, %rcx
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(%rdi), STATE0_LO
	movdqu 0x10(%rdi), STATE0_HI
	movdqu 0x20(%rdi), STATE1_LO
	movdqu 0x30(%rdi), STATE1_HI
	movdqu 0x40(%rdi), STATE2_LO
	movdqu 0x50(%rdi), STATE2_HI
	movdqu 0x60(%rdi), STATE3_LO
	movdqu 0x70(%rdi), STATE3_HI
	movdqu 0x80(%rdi), STATE4_LO
	movdqu 0x90(%rdi), STATE4_HI

	/* the aligned loop needs both src and dst 16-byte aligned */
	mov %esi, %r8d
	or  %edx, %r8d
	testb $0xF, %r8b
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI

	/* T1 = rol3(STATE1) */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO

	/* low half of the plaintext, accumulated in place in MSG_LO */
	pxor T1_LO, MSG_LO
	pxor STATE0_LO, MSG_LO
	movdqa STATE2_LO, T1_LO
	pand STATE3_LO, T1_LO
	pxor T1_LO, MSG_LO

	/* high half of the plaintext, accumulated in place in MSG_HI */
	pxor T1_HI, MSG_HI
	pxor STATE0_HI, MSG_HI
	movdqa STATE2_HI, T1_HI
	pand STATE3_HI, T1_HI
	pxor T1_HI, MSG_HI

	movdqa MSG_LO,  0(%rdx)
	movdqa MSG_HI, 16(%rdx)

	/* absorb the recovered plaintext block */
	call __morus1280_update
	add $32, %rsi
	add $32, %rdx
	sub $32, %rcx
	cmp $32, %rcx
	jge .Ldec_a_loop

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	/* T1 = rol3(STATE1) */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO

	/* low half of the plaintext, accumulated in place in MSG_LO */
	pxor T1_LO, MSG_LO
	pxor STATE0_LO, MSG_LO
	movdqa STATE2_LO, T1_LO
	pand STATE3_LO, T1_LO
	pxor T1_LO, MSG_LO

	/* high half of the plaintext, accumulated in place in MSG_HI */
	pxor T1_HI, MSG_HI
	pxor STATE0_HI, MSG_HI
	movdqa STATE2_HI, T1_HI
	pand STATE3_HI, T1_HI
	pxor T1_HI, MSG_HI

	movdqu MSG_LO,  0(%rdx)
	movdqu MSG_HI, 16(%rdx)

	/* absorb the recovered plaintext block */
	call __morus1280_update
	add $32, %rsi
	add $32, %rdx
	sub $32, %rcx
	cmp $32, %rcx
	jge .Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu STATE0_LO, 0x00(%rdi)
	movdqu STATE0_HI, 0x10(%rdi)
	movdqu STATE1_LO, 0x20(%rdi)
	movdqu STATE1_HI, 0x30(%rdi)
	movdqu STATE2_LO, 0x40(%rdi)
	movdqu STATE2_HI, 0x50(%rdi)
	movdqu STATE3_LO, 0x60(%rdi)
	movdqu STATE3_HI, 0x70(%rdi)
	movdqu STATE4_LO, 0x80(%rdi)
	movdqu STATE4_HI, 0x90(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec)

/*
 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Decrypts a final partial block: the first 'length' bytes of src are loaded
 * zero-padded, decrypted, and the first 'length' plaintext bytes are written
 * to dst. The plaintext block is then cleared beyond 'length' bytes and
 * absorbed into the state.
 *
 * input:
 *   %rdi - state (read and written back)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length in bytes
 * changed:
 *   %r8, %r9, %r10 (via __load_partial / __store_partial)
 */
ENTRY(crypto_morus1280_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* decrypt message: MSG = zero-padded ciphertext */
	call __load_partial

	/* MSG ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* __store_partial consumes (and destroys) T0, so hand it a copy */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI

	call __store_partial

	/*
	 * mask with byte count: the bytes of MSG past the message length
	 * hold keystream rather than zero padding and must be cleared
	 * before the block is absorbed.
	 */
	/* broadcast the low byte of the count to all 16 bytes of T0_LO */
	movq %rcx, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	movdqa T0_LO, T0_HI
	/*
	 * byte-index tables, addressed RIP-relative so the code does not
	 * depend on absolute relocations against .rodata
	 */
	movdqa .Lmorus640_counter_0(%rip), T1_LO
	movdqa .Lmorus640_counter_1(%rip), T1_HI
	/* byte i of the mask is 0xff iff count > i (signed byte compare) */
	pcmpgtb T1_LO, T0_LO
	pcmpgtb T1_HI, T0_HI
	pand T0_LO, MSG_LO
	pand T0_HI, MSG_HI

	call __morus1280_update

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec_tail)

/*
 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finalizes the state with the bit lengths of the associated data and the
 * message, then XORs the 32-byte authentication value
 *
 *   STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * into the 32 bytes at tag_xor. The state in memory is only read; it is not
 * written back.
 *
 * input:
 *   %rdi - state   (read only)
 *   %rsi - tag_xor (32 bytes, read and written)
 *   %rdx - assoclen in bytes
 *   %rcx - cryptlen in bytes
 */
ENTRY(crypto_morus1280_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* xor state[0] into state[4]: */
	pxor STATE0_LO, STATE4_LO
	pxor STATE0_HI, STATE4_HI

	/*
	 * prepare length block: low qword = assoclen, next qword = cryptlen,
	 * upper 128 bits zero
	 */
	movq %rdx, MSG_LO
	movq %rcx, T0_LO
	pslldq $8, T0_LO
	pxor T0_LO, MSG_LO
	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
	pxor MSG_HI, MSG_HI

	/* update state: absorb the length block 10 times */
	.rept 10
	call __morus1280_update
	.endr

	/* xor tag: */
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/*
	 * rol3 uses T0_LO as its scratch register, so the value being
	 * rotated must live elsewhere: with T0_LO as the low operand the
	 * macro reads back its own partially built result and the top
	 * qword of the rotated value is wrong. Rotate in T1 instead.
	 */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T0_LO
	movdqa STATE2_HI, T0_HI
	pand STATE3_LO, T0_LO
	pand STATE3_HI, T0_HI
	pxor T0_LO, MSG_LO
	pxor T0_HI, MSG_HI

	movdqu MSG_LO,  0(%rsi)
	movdqu MSG_HI, 16(%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_final)
