/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')

#include <linux/linkage.h>

.text
.option arch, +zvkned

#include "aes-macros.S"

#define KEYP        a0
#define INP         a1
#define OUTP        a2
#define LEN         a3
#define IVP         a4
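
// KEYP, INP, OUTP, LEN, and IVP alias the first five integer argument
// registers of the standard RISC-V calling convention, so they correspond,
// in order, to the parameters of the C prototypes given before each
// function below.  LEN is always a byte count, and IVP (where used) points
// to a 16-byte IV buffer.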

.macro __aes_crypt_zvkned enc, keylen
        vle32.v         v16, (INP)
        aes_crypt       v16, \enc, \keylen
        vse32.v         v16, (OUTP)
        ret
.endm

.macro aes_crypt_zvkned enc
        aes_begin       KEYP, 128f, 192f
        __aes_crypt_zvkned \enc, 256
128:
        __aes_crypt_zvkned \enc, 128
192:
        __aes_crypt_zvkned \enc, 192
.endm
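
// aes_begin (defined in aes-macros.S) loads the round keys from KEYP and,
// as used throughout this file, dispatches on the key length: it branches
// to the first label (128f) for AES-128, to the second (192f) for AES-192,
// and falls through for AES-256.  Since every __aes_crypt_zvkned expansion
// ends in ret, exactly one key-length variant executes.  In C terms the
// dispatch is roughly the following sketch (not code from this file):
//
//        switch (key->key_length) {
//        case 16: return do_crypt_128();
//        case 24: return do_crypt_192();
//        default: return do_crypt_256();
//        }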

// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
//                         const u8 in[16], u8 out[16]);
SYM_FUNC_START(aes_encrypt_zvkned)
        aes_crypt_zvkned 1
SYM_FUNC_END(aes_encrypt_zvkned)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_decrypt_zvkned)
        aes_crypt_zvkned 0
SYM_FUNC_END(aes_decrypt_zvkned)

.macro __aes_ecb_crypt enc, keylen
        srli            t0, LEN, 2
        // t0 is the remaining length in 32-bit words.  It's a multiple of 4.
1:
        vsetvli         t1, t0, e32, m8, ta, ma
        sub             t0, t0, t1  // Subtract number of words processed
        slli            t1, t1, 2  // Words to bytes
        vle32.v         v16, (INP)
        aes_crypt       v16, \enc, \keylen
        vse32.v         v16, (OUTP)
        add             INP, INP, t1
        add             OUTP, OUTP, t1
        bnez            t0, 1b

        ret
.endm
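
// The loop above is a standard RVV strip-mining loop: vsetvli sets vl to
// at most VLMAX for e32/m8 and returns it in t1, and the pointers and the
// remaining word count advance by that amount each pass.  As a worked
// example, on an implementation with VLEN=128, VLMAX is 32 words here
// (eight AES blocks), so a 128-byte request (t0 = 32) completes in one
// pass and a 256-byte request (t0 = 64) in exactly two.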

.macro aes_ecb_crypt enc
        aes_begin       KEYP, 128f, 192f
        __aes_ecb_crypt \enc, 256
128:
        __aes_ecb_crypt \enc, 128
192:
        __aes_ecb_crypt \enc, 192
.endm

// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
//                             const u8 *in, u8 *out, size_t len);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
SYM_FUNC_START(aes_ecb_encrypt_zvkned)
        aes_ecb_crypt 1
SYM_FUNC_END(aes_ecb_encrypt_zvkned)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_ecb_decrypt_zvkned)
        aes_ecb_crypt 0
SYM_FUNC_END(aes_ecb_decrypt_zvkned)

.macro aes_cbc_encrypt keylen
        vle32.v         v16, (IVP)  // Load IV
1:
        vle32.v         v17, (INP)  // Load plaintext block
        vxor.vv         v16, v16, v17  // XOR with IV or prev ciphertext block
        aes_encrypt     v16, \keylen  // Encrypt
        vse32.v         v16, (OUTP)  // Store ciphertext block
        addi            INP, INP, 16
        addi            OUTP, OUTP, 16
        addi            LEN, LEN, -16
        bnez            LEN, 1b

        vse32.v         v16, (IVP)  // Store next IV
        ret
.endm
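
// CBC encryption is inherently serial: each plaintext block is XORed with
// the previous ciphertext block before being encrypted, so the loop above
// handles one block per iteration and v16 carries the chaining value
// (which is also written back to IVP so the caller can continue the
// chain).  Illustrative C reference model (a sketch only, with
// hypothetical one-block helpers aes_encrypt_block() and xor_block()):
//
//        u8 chain[16];
//        size_t i;
//
//        memcpy(chain, iv, 16);
//        for (i = 0; i < len; i += 16) {
//                xor_block(chain, in + i);       /* chain ^= P[i] */
//                aes_encrypt_block(key, chain);  /* chain  = C[i] */
//                memcpy(out + i, chain, 16);
//        }
//        memcpy(iv, chain, 16);                  /* IV for the next call */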

.macro aes_cbc_decrypt keylen
        srli            LEN, LEN, 2  // Convert LEN from bytes to words
        vle32.v         v16, (IVP)  // Load IV
1:
        vsetvli         t0, LEN, e32, m4, ta, ma
        vle32.v         v20, (INP)  // Load ciphertext blocks
        vslideup.vi     v16, v20, 4  // Setup prev ciphertext blocks
        addi            t1, t0, -4
        vslidedown.vx   v24, v20, t1  // Save last ciphertext block
        aes_decrypt     v20, \keylen  // Decrypt the blocks
        vxor.vv         v20, v20, v16  // XOR with prev ciphertext blocks
        vse32.v         v20, (OUTP)  // Store plaintext blocks
        vmv.v.v         v16, v24  // Next "IV" is last ciphertext block
        slli            t1, t0, 2  // Words to bytes
        add             INP, INP, t1
        add             OUTP, OUTP, t1
        sub             LEN, LEN, t0
        bnez            LEN, 1b

        vsetivli        zero, 4, e32, m1, ta, ma
        vse32.v         v16, (IVP)  // Store next IV
        ret
.endm
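
// Unlike encryption, CBC decryption parallelizes across blocks, since each
// plaintext block is Decrypt(C[i]) ^ C[i-1] and all ciphertext blocks are
// already known.  The loop above therefore decrypts several blocks per
// pass.  Before the slide, v16 holds the chaining block (the IV, or the
// last ciphertext block of the previous pass) in elements 0-3;
// vslideup.vi v16, v20, 4 leaves those elements alone and shifts v20 up by
// one block, so v16 becomes [chain, C_j, C_j+1, ...], exactly the values
// the decrypted blocks must be XORed with.  vslidedown.vx copies the final
// ciphertext block of this pass into v24 before aes_decrypt overwrites
// v20, so it can serve as the next pass's chaining block and, at the end,
// as the IV written back to IVP.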

// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
//                             const u8 *in, u8 *out, size_t len, u8 iv[16]);
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
SYM_FUNC_START(aes_cbc_encrypt_zvkned)
        aes_begin       KEYP, 128f, 192f
        aes_cbc_encrypt 256
128:
        aes_cbc_encrypt 128
192:
        aes_cbc_encrypt 192
SYM_FUNC_END(aes_cbc_encrypt_zvkned)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_cbc_decrypt_zvkned)
        aes_begin       KEYP, 128f, 192f
        aes_cbc_decrypt 256
128:
        aes_cbc_decrypt 128
192:
        aes_cbc_decrypt 192
SYM_FUNC_END(aes_cbc_decrypt_zvkned)

.macro aes_cbc_cts_encrypt keylen

        // CBC-encrypt all blocks except the last.  But don't store the
        // second-to-last block to the output buffer yet, since it will be
        // handled specially in the ciphertext stealing step.  Exception: if
        // the message is single-block, still encrypt the last (and only) block.
        li              t0, 16
        j               2f
1:
        vse32.v         v16, (OUTP)  // Store ciphertext block
        addi            OUTP, OUTP, 16
2:
        vle32.v         v17, (INP)  // Load plaintext block
        vxor.vv         v16, v16, v17  // XOR with IV or prev ciphertext block
        aes_encrypt     v16, \keylen  // Encrypt
        addi            INP, INP, 16
        addi            LEN, LEN, -16
        bgt             LEN, t0, 1b  // Repeat if more than one block remains

        // Special case: if the message is a single block, just do CBC.
        beqz            LEN, .Lcts_encrypt_done\@

        // Encrypt the last two blocks using ciphertext stealing as follows:
        //        C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
        //        C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
        //
        // C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
        // plaintext block.  Block n, the last block, may be partial; its length
        // is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
        //
        // v16 already contains Encrypt(P[n-1] ^ C[n-2]).
        // INP points to P[n].  OUTP points to where C[n-1] should go.
        // To support in-place encryption, load P[n] before storing C[n].
        addi            t0, OUTP, 16  // Get pointer to where C[n] should go
        vsetvli         zero, LEN, e8, m1, tu, ma
        vle8.v          v17, (INP)  // Load P[n]
        vse8.v          v16, (t0)  // Store C[n]
        vxor.vv         v16, v16, v17  // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
        vsetivli        zero, 4, e32, m1, ta, ma
        aes_encrypt     v16, \keylen
.Lcts_encrypt_done\@:
        vse32.v         v16, (OUTP)  // Store C[n-1] (or C[n] in single-block case)
        ret
.endm
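
// Illustrative C reference model of the stealing step above (a sketch
// only, not code from this file; aes_encrypt_block() and the buffer names
// are hypothetical, r is the 1..16-byte length of the final block, and e
// is the value left in v16 by the loop, i.e. Encrypt(P[n-1] ^ C[n-2])):
//
//        memcpy(c_n, e, r);              /* C[n] = e[0..r]      */
//        for (i = 0; i < r; i++)
//                e[i] ^= p_n[i];         /* P[n], padded with e */
//        aes_encrypt_block(key, e);
//        memcpy(c_n_minus_1, e, 16);     /* C[n-1]              */
//
// The sketch assumes non-overlapping buffers; the assembly additionally
// loads P[n] before storing C[n] so that in-place (in == out) calls work.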

#define LEN32       t4  // Length of remaining full blocks in 32-bit words
#define LEN_MOD16   t5  // Length of message in bytes mod 16

.macro aes_cbc_cts_decrypt keylen
        andi            LEN32, LEN, ~15
        srli            LEN32, LEN32, 2
        andi            LEN_MOD16, LEN, 15

        // Save C[n-2] in v28 so that it's available later during the ciphertext
        // stealing step.  If there are fewer than three blocks, C[n-2] means
        // the IV, otherwise it means the third-to-last ciphertext block.
        vmv.v.v         v28, v16  // IV
        add             t0, LEN, -33
        bltz            t0, .Lcts_decrypt_loop\@
        andi            t0, t0, ~15
        add             t0, t0, INP
        vle32.v         v28, (t0)
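        // The code above locates C[n-2] at byte offset (LEN - 33) & ~15
        // from INP.  Worked examples: a 33..48-byte message has n = 3
        // blocks and the offset is 0 (the first ciphertext block); a
        // 49..64-byte message has n = 4 and the offset is 16; in general
        // the offset is 16 * (n - 3).  For LEN <= 32 (at most two blocks),
        // LEN - 33 is negative and v28 keeps the IV instead.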

        // CBC-decrypt all full blocks.  For the last full block, or the last 2
        // full blocks if the message is block-aligned, this doesn't write the
        // correct output blocks (unless the message is only a single block),
        // because it XORs the wrong values with the raw AES plaintexts.  But we
        // fix this after this loop without redoing the AES decryptions.  This
        // approach allows more of the AES decryptions to be parallelized.
.Lcts_decrypt_loop\@:
        vsetvli         t0, LEN32, e32, m4, ta, ma
        addi            t1, t0, -4
        vle32.v         v20, (INP)  // Load next set of ciphertext blocks
        vmv.v.v         v24, v16  // Get IV or last ciphertext block of prev set
        vslideup.vi     v24, v20, 4  // Setup prev ciphertext blocks
        vslidedown.vx   v16, v20, t1  // Save last ciphertext block of this set
        aes_decrypt     v20, \keylen  // Decrypt this set of blocks
        vxor.vv         v24, v24, v20  // XOR prev ciphertext blocks with decrypted blocks
        vse32.v         v24, (OUTP)  // Store this set of plaintext blocks
        sub             LEN32, LEN32, t0
        slli            t0, t0, 2  // Words to bytes
        add             INP, INP, t0
        add             OUTP, OUTP, t0
        bnez            LEN32, .Lcts_decrypt_loop\@

        vsetivli        zero, 4, e32, m4, ta, ma
        vslidedown.vx   v20, v20, t1  // Extract raw plaintext of last full block
        addi            t0, OUTP, -16  // Get pointer to last full plaintext block
        bnez            LEN_MOD16, .Lcts_decrypt_non_block_aligned\@

        // Special case: if the message is a single block, just do CBC.
        li              t1, 16
        beq             LEN, t1, .Lcts_decrypt_done\@

        // Block-aligned message.  Just fix up the last 2 blocks.  We need:
        //
        //        P[n-1] = Decrypt(C[n]) ^ C[n-2]
        //        P[n] = Decrypt(C[n-1]) ^ C[n]
        //
        // We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
        // Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
        // is everything needed to fix the output without re-decrypting blocks.
        addi            t1, OUTP, -32  // Get pointer to where P[n-1] should go
        vxor.vv         v20, v20, v28  // Decrypt(C[n]) ^ C[n-2] == P[n-1]
        vle32.v         v24, (t1)  // Decrypt(C[n-1]) ^ C[n-2]
        vse32.v         v20, (t1)  // Store P[n-1]
        vxor.vv         v20, v24, v16  // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
        j               .Lcts_decrypt_finish\@

.Lcts_decrypt_non_block_aligned\@:
        // Decrypt the last two blocks using ciphertext stealing as follows:
        //
        //        P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
        //        P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
        //
        // We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
        vmv.v.v         v16, v20  // v16 = Decrypt(C[n-1])
        vsetvli         zero, LEN_MOD16, e8, m1, tu, ma
        vle8.v          v20, (INP)  // v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
        vxor.vv         v16, v16, v20  // v16 = Decrypt(C[n-1]) ^ C[n]
        vse8.v          v16, (OUTP)  // Store P[n]
        vsetivli        zero, 4, e32, m1, ta, ma
        aes_decrypt     v20, \keylen  // v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
.Lcts_decrypt_finish\@:
        vxor.vv         v20, v20, v28  // XOR with C[n-2]
        vse32.v         v20, (t0)  // Store last full plaintext block
.Lcts_decrypt_done\@:
        ret
.endm
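
// Illustrative C reference model of the non-block-aligned stealing step
// above (a sketch only, not code from this file; aes_decrypt_block() and
// the buffer names are hypothetical, r = LEN_MOD16, d is Decrypt(C[n-1])
// as produced by the loop, c_n is the final r ciphertext bytes, and
// c_n_minus_2 is the block saved in v28):
//
//        for (i = 0; i < r; i++)
//                p_n[i] = d[i] ^ c_n[i];         /* P[n]                 */
//        memcpy(d, c_n, r);                      /* d = C[n] || d[r..16] */
//        aes_decrypt_block(key, d);
//        for (i = 0; i < 16; i++)
//                p_n_minus_1[i] = d[i] ^ c_n_minus_2[i];
//
// The block-aligned path applies the same idea, but recovers
// Decrypt(C[n-1]) ^ C[n-2] from the output buffer rather than re-running
// any AES block decryptions.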

.macro aes_cbc_cts_crypt keylen
        vle32.v         v16, (IVP)  // Load IV
        beqz            a5, .Lcts_decrypt\@
        aes_cbc_cts_encrypt \keylen
.Lcts_decrypt\@:
        aes_cbc_cts_decrypt \keylen
.endm
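
// a5 holds the sixth argument, |enc|: nonzero selects encryption, zero
// selects decryption.  There is no fall-through from the encrypt path into
// the decrypt path, because aes_cbc_cts_encrypt always ends in ret.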

// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
//                               const u8 *in, u8 *out, size_t len,
//                               const u8 iv[16], bool enc);
//
// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
// This is the variant that unconditionally swaps the last two blocks.
SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
        aes_begin       KEYP, 128f, 192f
        aes_cbc_cts_crypt 256
128:
        aes_cbc_cts_crypt 128
192:
        aes_cbc_cts_crypt 192
SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)
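
// Minimal sketch of a C caller, based on the prototype above (illustrative
// only; raw_key, src, and dst are placeholder buffers, and the real
// in-kernel glue code handling key expansion and vector-context
// management lives elsewhere):
//
//        struct crypto_aes_ctx ctx;
//        u8 iv[16] = { /* ... */ };
//
//        aes_expandkey(&ctx, raw_key, 32);       /* AES-256 key schedule */
//        kernel_vector_begin();
//        /* 40 bytes: two full blocks plus an 8-byte stolen final block */
//        aes_cbc_cts_crypt_zvkned(&ctx, src, dst, 40, iv, true);
//        kernel_vector_end();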