 // arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S - linux - Git at Google

 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
 //
 // This file is dual-licensed, meaning that you can use it under your
 // choice of either of the following two licenses:
 //
 // Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
 //
 // Licensed under the Apache License 2.0 (the "License"). You can obtain
 // a copy in the file LICENSE in the source distribution or at
 // https://www.openssl.org/source/license.html
 //
 // or
 //
 // Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
 // Copyright 2024 Google LLC
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
 // are met:
 // 1. Redistributions of source code must retain the above copyright
 //    notice, this list of conditions and the following disclaimer.
 // 2. Redistributions in binary form must reproduce the above copyright
 //    notice, this list of conditions and the following disclaimer in the
 //    documentation and/or other materials provided with the distribution.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 // The generated code of this file depends on the following RISC-V extensions:
 // - RV64I
 // - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
 // - RISC-V Vector AES block cipher extension ('Zvkned')
 // - RISC-V Vector Bit-manipulation extension ('Zvbb')
 // - RISC-V Vector GCM/GMAC extension ('Zvkg')

 #include <linux/linkage.h>

 .text
 .option arch, +zvkned, +zvbb, +zvkg

 #include "aes-macros.S"

 // Function arguments.  The functions at the end of this file take
 // (key, in, out, len, tweak), which the standard RISC-V calling convention
 // passes in a0-a4 in that order.
 #define KEYP		a0
 #define INP		a1
 #define OUTP		a2
 #define LEN		a3
 #define TWEAKP		a4

 // Working registers derived from the arguments:
 //	LEN32:    length still to be handled by the main loop, in 32-bit words
 //	TAIL_LEN: LEN & 15, i.e. the size of the trailing partial block (0 if none)
 //	VL:       vl returned by the latest "vsetvli VL, LEN32, e32, m4"
 //	VLMAX:    largest vl available for e32/m4, queried before the main loop
 #define LEN32		a5
 #define TAIL_LEN	a6
 #define VL		a7
 #define VLMAX		t4

 // v1-v15 contain the AES round keys, but they are used for temporaries before
 // the AES round keys have been loaded.
 #define TWEAKS		v16	// LMUL=4 (most of the time)
 #define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
 #define MULTS_BREV	v24	// LMUL=4 (most of the time)
 // TMP0-TMP3 are consecutive registers.  The main loop uses TMP0 with LMUL=4, so
 // there it covers all of v28-v31; the ciphertext stealing code uses the four
 // registers individually with LMUL=1.
 #define TMP0		v28
 #define TMP1		v29
 #define TMP2		v30
 #define TMP3		v31

 // xts_init initializes the following values:
 //
 //	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
 //	TWEAKS_BREV: same as TWEAKS, but bit-reversed
 //	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
 //
 // N is the maximum number of blocks that will be processed per loop iteration,
 // computed using vsetvli.
 //
 // The field convention used by XTS is the same as that of GHASH, but with the
 // bits reversed within each byte.  The zvkg extension provides the vgmul
 // instruction which does multiplication in this field.  Therefore, for tweak
 // computation we use vgmul to do multiplications in parallel, instead of
 // serially multiplying by x using shifting+xoring.  Note that for this to work,
 // the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
 //
 // Inputs:  TWEAKP (pointer to the 16-byte tweak T), LEN (main-loop length in
 //	    bytes) and LEN32 (the same length in 32-bit words).
 // Scratch: t0, t1, VL, v0-v5 and v8-v12.  Since v1-v15 later hold the AES round
 //	    keys, this macro has to run before they are loaded.
 // When LEN <= 16, only the first 128 bits of TWEAKS and TWEAKS_BREV are
 // written and MULTS_BREV is left untouched.
 .macro	xts_init

 	// Load the first tweak T.
 	vsetivli	zero, 4, e32, m1, ta, ma
 	vle32.v		TWEAKS, (TWEAKP)

 	// If there's only one block (or no blocks at all), then skip the tweak
 	// sequence computation because (at most) T itself is needed.
 	li		t0, 16
 	ble		LEN, t0, .Linit_single_block\@

 	// Save a copy of T bit-reversed in v12.
 	vbrev8.v	v12, TWEAKS

 	//
 	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
 	// that N <= 128.  Though, this code actually requires N < 64 (or
 	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
 	// values here and in the x^N computation later.
 	//
 	// VL counts 32-bit elements and a block is four of them, hence the
 	// shift by 2 to get the block count.
 	vsetvli		VL, LEN32, e32, m4, ta, ma
 	srli		t0, VL, 2	// t0 = N (num blocks)
 	// Generate two sequences, each with N 32-bit values:
 	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
 	vsetvli		zero, t0, e32, m1, ta, ma
 	vmv.v.i		v0, 1
 	vid.v		v1
 	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
 	// as two sequences, each with 2*N 32-bit values:
 	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
 	vsetvli		zero, t0, e64, m2, ta, ma
 	vzext.vf2	v2, v0
 	vzext.vf2	v4, v1
 	slli		t1, t0, 1	// t1 = 2*N
 	vsetvli		zero, t1, e32, m2, ta, ma
 	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
 	// widening to 64 bits per element.  When reinterpreted as N 128-bit
 	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
 	// The widened result occupies the four registers v8-v11.
 	vwsll.vv	v8, v2, v4

 	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
 	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
 	// vaesz.vs XORs the first 128-bit group of its source into every group
 	// of the destination, so on a zeroed destination it is a broadcast.
 	vsetvli		zero, LEN32, e32, m4, ta, ma
 	vmv.v.i		TWEAKS_BREV, 0
 	vaesz.vs	TWEAKS_BREV, v12
 	vbrev8.v	v8, v8
 	vgmul.vv	TWEAKS_BREV, v8

 	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
 	vbrev8.v	TWEAKS, TWEAKS_BREV

 	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
 	// v0 is first cleared as a full 128-bit value; the following write of
 	// its low 64 bits uses the tail-undisturbed policy so that the high 64
 	// bits stay zero.
 	li		t1, 1
 	sll		t1, t1, t0	// t1 = 1 << N
 	vsetivli	zero, 2, e64, m1, ta, ma
 	vmv.v.i		v0, 0
 	vsetivli	zero, 1, e64, m1, tu, ma
 	vmv.v.x		v0, t1
 	vbrev8.v	v0, v0
 	// Same zero-then-vaesz.vs broadcast as used for TWEAKS_BREV above.
 	vsetvli		zero, LEN32, e32, m4, ta, ma
 	vmv.v.i		MULTS_BREV, 0
 	vaesz.vs	MULTS_BREV, v0

 	j		.Linit_done\@

 .Linit_single_block\@:
 	// Still under the vl=4, e32, m1 setting from the top of the macro, so
 	// this bit-reverses just the single tweak T.
 	vbrev8.v	TWEAKS_BREV, TWEAKS
 .Linit_done\@:
 .endm

 // Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
 // the multiplier required to advance the tweak by one.
 //
 // Clobbers t0 and leaves the vector unit configured as vl=1, e8, m1, tu, ma, so
 // callers must issue their own vsetvli/vsetivli before further vector work.
 .macro	load_x
 	li		t0, 0x40
 	// Clear all 16 bytes first ...
 	vsetivli	zero, 4, e32, m1, ta, ma
 	vmv.v.i		MULTS_BREV, 0
 	// ... then write only byte 0.  The tail-undisturbed policy keeps bytes
 	// 1-15 at the zero written above.
 	vsetivli	zero, 1, e8, m1, tu, ma
 	vmv.v.x		MULTS_BREV, t0
 .endm

 // __aes_xts_crypt: the main loop plus the ciphertext stealing tail, for one
 // key length.
 //
 //	enc:    1 to encrypt, 0 to decrypt.  Forwarded to aes_crypt, and also
 //		selects the order of the two tweaks used for ciphertext stealing.
 //	keylen: AES key length in bits (invoked with 128, 192 and 256).
 //		Forwarded to aes_crypt.
 //
 // Expects LEN32 and TAIL_LEN to be set up and xts_init to have run.  aes_crypt
 // comes from aes-macros.S and is not visible here; presumably it needs the
 // round keys to be loaded already -- confirm against that file.
 // Both exits of this macro end in ret, so an expansion never falls through
 // into whatever follows it.
 .macro	__aes_xts_crypt	enc, keylen
 	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
 	beqz		LEN32, .Lcts_without_main_loop\@

 	// With zero as the requested length and a real destination register,
 	// vsetvli reports the maximum vl for e32/m4.
 	vsetvli		VLMAX, zero, e32, m4, ta, ma
 	// Label 1 asks for a fresh vl; label 2 reuses the vl already in effect.
 1:
 	vsetvli		VL, LEN32, e32, m4, ta, ma
 2:
 	// Encrypt or decrypt VL/4 blocks.
 	// With LMUL=4, TMP0 names the whole group v28-v31 here.
 	vle32.v		TMP0, (INP)
 	vxor.vv		TMP0, TMP0, TWEAKS
 	aes_crypt	TMP0, \enc, \keylen
 	vxor.vv		TMP0, TMP0, TWEAKS
 	vse32.v		TMP0, (OUTP)

 	// Update the pointers and the remaining length.
 	// VL is in 32-bit words; the pointers advance in bytes (VL * 4).
 	slli		t0, VL, 2
 	add		INP, INP, t0
 	add		OUTP, OUTP, t0
 	sub		LEN32, LEN32, VL

 	// Check whether more blocks remain.
 	beqz		LEN32, .Lmain_loop_done\@

 	// Compute the next sequence of tweaks by multiplying the previous
 	// sequence by x^N.  Store the result in both bit-reversed order and
 	// regular order (i.e. with the bit reversal undone).
 	vgmul.vv	TWEAKS_BREV, MULTS_BREV
 	vbrev8.v	TWEAKS, TWEAKS_BREV

 	// Since we compute the tweak multipliers x^N in advance, we require
 	// that each iteration process the same length except possibly the last.
 	// This conflicts slightly with the behavior allowed by RISC-V Vector
 	// Extension, where CPUs can select a lower length for both of the last
 	// two iterations.  E.g., vl might take the sequence of values
 	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
 	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
 	// keep the vl at VLMAX if there is at least VLMAX remaining.
 	bge		LEN32, VLMAX, 2b
 	j		1b

 .Lmain_loop_done\@:
 	// From here on MULTS_BREV holds the single multiplier 'x' instead of
 	// the x^N sequence.
 	load_x

 	// Compute the next tweak.
 	// VL still holds the vl of the final loop iteration, so VL - 4 is the
 	// word index at which the last tweak used starts.  The slide runs with
 	// vl=4 but LMUL=4: one 128-bit value is written, yet it may be sourced
 	// from any register of the TWEAKS_BREV group.
 	addi		t0, VL, -4
 	vsetivli	zero, 4, e32, m4, ta, ma
 	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
 	vsetivli	zero, 4, e32, m1, ta, ma
 	vgmul.vv	TWEAKS_BREV, MULTS_BREV		// Advance to next tweak

 	bnez		TAIL_LEN, .Lcts\@

 	// Update *TWEAKP to contain the next tweak.
 	vbrev8.v	TWEAKS, TWEAKS_BREV
 	vse32.v		TWEAKS, (TWEAKP)
 	ret

 .Lcts_without_main_loop\@:
 	// No main loop ran, so TWEAKS_BREV still holds the bit-reversed T left
 	// by xts_init's single-block path; only the multiplier is missing.
 	load_x
 .Lcts\@:
 	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
 	vsetivli	zero, 4, e32, m1, ta, ma
 	vmv.v.v		TMP0, TWEAKS_BREV
 	vgmul.vv	TMP0, MULTS_BREV
 	// Undo the bit reversal of the next two tweaks and store them in TMP1
 	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
 	// Decryption consumes the same two tweaks in the opposite order.
 .if \enc
 	vbrev8.v	TMP1, TWEAKS_BREV
 	vbrev8.v	TMP2, TMP0
 .else
 	vbrev8.v	TMP1, TMP0
 	vbrev8.v	TMP2, TWEAKS_BREV
 .endif

 	// Encrypt/decrypt the last full block.
 	vle32.v		TMP0, (INP)
 	vxor.vv		TMP0, TMP0, TMP1
 	aes_crypt	TMP0, \enc, \keylen
 	vxor.vv		TMP0, TMP0, TMP1

 	// Swap the first TAIL_LEN bytes of the above result with the tail.
 	// Note that to support in-place encryption/decryption, the load from
 	// the input tail must happen before the store to the output tail.
 	// TMP3 keeps a copy of the result.  The byte load below uses the
 	// tail-undisturbed policy, so it replaces only the first TAIL_LEN bytes
 	// of TMP0 and leaves the remaining bytes of the result in place.
 	addi		t0, INP, 16
 	addi		t1, OUTP, 16
 	vmv.v.v		TMP3, TMP0
 	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
 	vle8.v		TMP0, (t0)
 	vse8.v		TMP3, (t1)

 	// Encrypt/decrypt again and store the last full block.
 	vsetivli	zero, 4, e32, m1, ta, ma
 	vxor.vv		TMP0, TMP0, TMP2
 	aes_crypt	TMP0, \enc, \keylen
 	vxor.vv		TMP0, TMP0, TMP2
 	vse32.v		TMP0, (OUTP)

 	ret
 .endm

 // aes_xts_crypt: complete body of an XTS encryption (enc=1) or decryption
 // (enc=0) function.
 //
 // It splits the length into the part handled by the main loop and the part
 // handled by ciphertext stealing, runs xts_init, and then emits one
 // __aes_xts_crypt expansion per key length.  Each of those expansions ends in
 // ret, so control never falls from one into the next.
 .macro	aes_xts_crypt	enc

 	// Check whether the length is a multiple of the AES block size.
 	andi		TAIL_LEN, LEN, 15
 	beqz		TAIL_LEN, 1f

 	// The length isn't a multiple of the AES block size, so ciphertext
 	// stealing will be required.  Ciphertext stealing involves special
 	// handling of the partial block and the last full block, so subtract
 	// the length of both from the length to be processed in the main loop.
 	sub		LEN, LEN, TAIL_LEN
 	addi		LEN, LEN, -16
 1:
 	srli		LEN32, LEN, 2
 	// LEN and LEN32 now contain the total length of the blocks that will be
 	// processed in the main loop, in bytes and 32-bit words respectively.

 	// xts_init uses v0-v5 and v8-v12 as scratch, so it is run before
 	// aes_begin.  aes_begin is defined in aes-macros.S (not visible here);
 	// presumably it loads the round keys and branches to the 128f or 192f
 	// label for those key sizes, falling through for 256-bit keys --
 	// confirm against that file.
 	xts_init
 	aes_begin	KEYP, 128f, 192f
 	__aes_xts_crypt	\enc, 256
 128:
 	__aes_xts_crypt	\enc, 128
 192:
 	__aes_xts_crypt	\enc, 192
 .endm

 // void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
 //					 const u8 *in, u8 *out, size_t len,
 //					 u8 tweak[16]);
 //
 // |key| is the data key.  |tweak| contains the next tweak; the encryption of
 // the original IV with the tweak key was already done.  This function supports
 // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
 // |len| must be a multiple of 16 except on the last call.  If |len| is a
 // multiple of 16, then this function updates |tweak| to contain the next tweak.
 //
 // The arguments are read from KEYP, INP, OUTP, LEN and TWEAKP (a0-a4).  The
 // body is the aes_xts_crypt macro expanded with enc=1, which supplies the
 // return instructions itself.
 SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
 	aes_xts_crypt	1
 SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

 // Same prototype and calling convention as the encryption function
 // (arguments key, in, out, len, tweak in a0-a4).  The body is the
 // aes_xts_crypt macro expanded with enc=0, which supplies the return
 // instructions itself.
 SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
 	aes_xts_crypt	0
 SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)
	/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
	//
	// This file is dual-licensed, meaning that you can use it under your
	// choice of either of the following two licenses:
	//
	// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
	//
	// Licensed under the Apache License 2.0 (the "License"). You can obtain
	// a copy in the file LICENSE in the source distribution or at
	// https://www.openssl.org/source/license.html
	//
	// or
	//
	// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
	// Copyright 2024 Google LLC
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	// 1. Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// 2. Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	// The generated code of this file depends on the following RISC-V extensions:
	// - RV64I
	// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
	// - RISC-V Vector AES block cipher extension ('Zvkned')
	// - RISC-V Vector Bit-manipulation extension ('Zvbb')
	// - RISC-V Vector GCM/GMAC extension ('Zvkg')

	#include <linux/linkage.h>

	.text
	.option arch, +zvkned, +zvbb, +zvkg

	#include "aes-macros.S"

	// Function arguments. The functions at the end of this file take
	// (key, in, out, len, tweak), which the standard RISC-V calling convention
	// passes in a0-a4 in that order.
	#define KEYP a0
	#define INP a1
	#define OUTP a2
	#define LEN a3
	#define TWEAKP a4

	// Working registers derived from the arguments:
	// LEN32: length still to be handled by the main loop, in 32-bit words
	// TAIL_LEN: LEN & 15, i.e. the size of the trailing partial block (0 if none)
	// VL: vl returned by the latest "vsetvli VL, LEN32, e32, m4"
	// VLMAX: largest vl available for e32/m4, queried before the main loop
	#define LEN32 a5
	#define TAIL_LEN a6
	#define VL a7
	#define VLMAX t4

	// v1-v15 contain the AES round keys, but they are used for temporaries before
	// the AES round keys have been loaded.
	#define TWEAKS v16 // LMUL=4 (most of the time)
	#define TWEAKS_BREV v20 // LMUL=4 (most of the time)
	#define MULTS_BREV v24 // LMUL=4 (most of the time)
	// TMP0-TMP3 are consecutive registers. The main loop uses TMP0 with LMUL=4, so
	// there it covers all of v28-v31; the ciphertext stealing code uses the four
	// registers individually with LMUL=1.
	#define TMP0 v28
	#define TMP1 v29
	#define TMP2 v30
	#define TMP3 v31

	// xts_init initializes the following values:
	//
	// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
	// TWEAKS_BREV: same as TWEAKS, but bit-reversed
	// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
	//
	// N is the maximum number of blocks that will be processed per loop iteration,
	// computed using vsetvli.
	//
	// The field convention used by XTS is the same as that of GHASH, but with the
	// bits reversed within each byte. The zvkg extension provides the vgmul
	// instruction which does multiplication in this field. Therefore, for tweak
	// computation we use vgmul to do multiplications in parallel, instead of
	// serially multiplying by x using shifting+xoring. Note that for this to work,
	// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
	//
	// Inputs: TWEAKP (pointer to the 16-byte tweak T), LEN (main-loop length in
	// bytes) and LEN32 (the same length in 32-bit words).
	// Scratch: t0, t1, VL, v0-v5 and v8-v12. Since v1-v15 later hold the AES round
	// keys, this macro has to run before they are loaded.
	// When LEN <= 16, only the first 128 bits of TWEAKS and TWEAKS_BREV are
	// written and MULTS_BREV is left untouched.
	.macro xts_init

	// Load the first tweak T.
	vsetivli zero, 4, e32, m1, ta, ma
	vle32.v TWEAKS, (TWEAKP)

	// If there's only one block (or no blocks at all), then skip the tweak
	// sequence computation because (at most) T itself is needed.
	li t0, 16
	ble LEN, t0, .Linit_single_block\@

	// Save a copy of T bit-reversed in v12.
	vbrev8.v v12, TWEAKS

	//
	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
	// that N <= 128. Though, this code actually requires N < 64 (or
	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
	// values here and in the x^N computation later.
	//
	// VL counts 32-bit elements and a block is four of them, hence the
	// shift by 2 to get the block count.
	vsetvli VL, LEN32, e32, m4, ta, ma
	srli t0, VL, 2 // t0 = N (num blocks)
	// Generate two sequences, each with N 32-bit values:
	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
	vsetvli zero, t0, e32, m1, ta, ma
	vmv.v.i v0, 1
	vid.v v1
	// Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
	// as two sequences, each with 2*N 32-bit values:
	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
	vsetvli zero, t0, e64, m2, ta, ma
	vzext.vf2 v2, v0
	vzext.vf2 v4, v1
	slli t1, t0, 1 // t1 = 2*N
	vsetvli zero, t1, e32, m2, ta, ma
	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
	// widening to 64 bits per element. When reinterpreted as N 128-bit
	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
	// The widened result occupies the four registers v8-v11.
	vwsll.vv v8, v2, v4

	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
	// multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
	// vaesz.vs XORs the first 128-bit group of its source into every group
	// of the destination, so on a zeroed destination it is a broadcast.
	vsetvli zero, LEN32, e32, m4, ta, ma
	vmv.v.i TWEAKS_BREV, 0
	vaesz.vs TWEAKS_BREV, v12
	vbrev8.v v8, v8
	vgmul.vv TWEAKS_BREV, v8

	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
	vbrev8.v TWEAKS, TWEAKS_BREV

	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
	// v0 is first cleared as a full 128-bit value; the following write of
	// its low 64 bits uses the tail-undisturbed policy so that the high 64
	// bits stay zero.
	li t1, 1
	sll t1, t1, t0 // t1 = 1 << N
	vsetivli zero, 2, e64, m1, ta, ma
	vmv.v.i v0, 0
	vsetivli zero, 1, e64, m1, tu, ma
	vmv.v.x v0, t1
	vbrev8.v v0, v0
	// Same zero-then-vaesz.vs broadcast as used for TWEAKS_BREV above.
	vsetvli zero, LEN32, e32, m4, ta, ma
	vmv.v.i MULTS_BREV, 0
	vaesz.vs MULTS_BREV, v0

	j .Linit_done\@

	.Linit_single_block\@:
	// Still under the vl=4, e32, m1 setting from the top of the macro, so
	// this bit-reverses just the single tweak T.
	vbrev8.v TWEAKS_BREV, TWEAKS
	.Linit_done\@:
	.endm

	// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
	// the multiplier required to advance the tweak by one.
	//
	// Clobbers t0 and leaves the vector unit configured as vl=1, e8, m1, tu, ma, so
	// callers must issue their own vsetvli/vsetivli before further vector work.
	.macro load_x
	li t0, 0x40
	// Clear all 16 bytes first ...
	vsetivli zero, 4, e32, m1, ta, ma
	vmv.v.i MULTS_BREV, 0
	// ... then write only byte 0. The tail-undisturbed policy keeps bytes
	// 1-15 at the zero written above.
	vsetivli zero, 1, e8, m1, tu, ma
	vmv.v.x MULTS_BREV, t0
	.endm

	// __aes_xts_crypt: the main loop plus the ciphertext stealing tail, for one
	// key length.
	//
	// enc: 1 to encrypt, 0 to decrypt. Forwarded to aes_crypt, and also
	// selects the order of the two tweaks used for ciphertext stealing.
	// keylen: AES key length in bits (invoked with 128, 192 and 256).
	// Forwarded to aes_crypt.
	//
	// Expects LEN32 and TAIL_LEN to be set up and xts_init to have run. aes_crypt
	// comes from aes-macros.S and is not visible here; presumably it needs the
	// round keys to be loaded already -- confirm against that file.
	// Both exits of this macro end in ret, so an expansion never falls through
	// into whatever follows it.
	.macro __aes_xts_crypt enc, keylen
	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
	beqz LEN32, .Lcts_without_main_loop\@

	// With zero as the requested length and a real destination register,
	// vsetvli reports the maximum vl for e32/m4.
	vsetvli VLMAX, zero, e32, m4, ta, ma
	// Label 1 asks for a fresh vl; label 2 reuses the vl already in effect.
	1:
	vsetvli VL, LEN32, e32, m4, ta, ma
	2:
	// Encrypt or decrypt VL/4 blocks.
	// With LMUL=4, TMP0 names the whole group v28-v31 here.
	vle32.v TMP0, (INP)
	vxor.vv TMP0, TMP0, TWEAKS
	aes_crypt TMP0, \enc, \keylen
	vxor.vv TMP0, TMP0, TWEAKS
	vse32.v TMP0, (OUTP)

	// Update the pointers and the remaining length.
	// VL is in 32-bit words; the pointers advance in bytes (VL * 4).
	slli t0, VL, 2
	add INP, INP, t0
	add OUTP, OUTP, t0
	sub LEN32, LEN32, VL

	// Check whether more blocks remain.
	beqz LEN32, .Lmain_loop_done\@

	// Compute the next sequence of tweaks by multiplying the previous
	// sequence by x^N. Store the result in both bit-reversed order and
	// regular order (i.e. with the bit reversal undone).
	vgmul.vv TWEAKS_BREV, MULTS_BREV
	vbrev8.v TWEAKS, TWEAKS_BREV

	// Since we compute the tweak multipliers x^N in advance, we require
	// that each iteration process the same length except possibly the last.
	// This conflicts slightly with the behavior allowed by RISC-V Vector
	// Extension, where CPUs can select a lower length for both of the last
	// two iterations. E.g., vl might take the sequence of values
	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
	// can use x^4 again instead of computing x^3. Therefore, we explicitly
	// keep the vl at VLMAX if there is at least VLMAX remaining.
	bge LEN32, VLMAX, 2b
	j 1b

	.Lmain_loop_done\@:
	// From here on MULTS_BREV holds the single multiplier 'x' instead of
	// the x^N sequence.
	load_x

	// Compute the next tweak.
	// VL still holds the vl of the final loop iteration, so VL - 4 is the
	// word index at which the last tweak used starts. The slide runs with
	// vl=4 but LMUL=4: one 128-bit value is written, yet it may be sourced
	// from any register of the TWEAKS_BREV group.
	addi t0, VL, -4
	vsetivli zero, 4, e32, m4, ta, ma
	vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak
	vsetivli zero, 4, e32, m1, ta, ma
	vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak

	bnez TAIL_LEN, .Lcts\@

	// Update *TWEAKP to contain the next tweak.
	vbrev8.v TWEAKS, TWEAKS_BREV
	vse32.v TWEAKS, (TWEAKP)
	ret

	.Lcts_without_main_loop\@:
	// No main loop ran, so TWEAKS_BREV still holds the bit-reversed T left
	// by xts_init's single-block path; only the multiplier is missing.
	load_x
	.Lcts\@:
	// TWEAKS_BREV now contains the next tweak. Compute the one after that.
	vsetivli zero, 4, e32, m1, ta, ma
	vmv.v.v TMP0, TWEAKS_BREV
	vgmul.vv TMP0, MULTS_BREV
	// Undo the bit reversal of the next two tweaks and store them in TMP1
	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
	// Decryption consumes the same two tweaks in the opposite order.
	.if \enc
	vbrev8.v TMP1, TWEAKS_BREV
	vbrev8.v TMP2, TMP0
	.else
	vbrev8.v TMP1, TMP0
	vbrev8.v TMP2, TWEAKS_BREV
	.endif

	// Encrypt/decrypt the last full block.
	vle32.v TMP0, (INP)
	vxor.vv TMP0, TMP0, TMP1
	aes_crypt TMP0, \enc, \keylen
	vxor.vv TMP0, TMP0, TMP1

	// Swap the first TAIL_LEN bytes of the above result with the tail.
	// Note that to support in-place encryption/decryption, the load from
	// the input tail must happen before the store to the output tail.
	// TMP3 keeps a copy of the result. The byte load below uses the
	// tail-undisturbed policy, so it replaces only the first TAIL_LEN bytes
	// of TMP0 and leaves the remaining bytes of the result in place.
	addi t0, INP, 16
	addi t1, OUTP, 16
	vmv.v.v TMP3, TMP0
	vsetvli zero, TAIL_LEN, e8, m1, tu, ma
	vle8.v TMP0, (t0)
	vse8.v TMP3, (t1)

	// Encrypt/decrypt again and store the last full block.
	vsetivli zero, 4, e32, m1, ta, ma
	vxor.vv TMP0, TMP0, TMP2
	aes_crypt TMP0, \enc, \keylen
	vxor.vv TMP0, TMP0, TMP2
	vse32.v TMP0, (OUTP)

	ret
	.endm

	// aes_xts_crypt: complete body of an XTS encryption (enc=1) or decryption
	// (enc=0) function.
	//
	// It splits the length into the part handled by the main loop and the part
	// handled by ciphertext stealing, runs xts_init, and then emits one
	// __aes_xts_crypt expansion per key length. Each of those expansions ends in
	// ret, so control never falls from one into the next.
	.macro aes_xts_crypt enc

	// Check whether the length is a multiple of the AES block size.
	andi TAIL_LEN, LEN, 15
	beqz TAIL_LEN, 1f

	// The length isn't a multiple of the AES block size, so ciphertext
	// stealing will be required. Ciphertext stealing involves special
	// handling of the partial block and the last full block, so subtract
	// the length of both from the length to be processed in the main loop.
	sub LEN, LEN, TAIL_LEN
	addi LEN, LEN, -16
	1:
	srli LEN32, LEN, 2
	// LEN and LEN32 now contain the total length of the blocks that will be
	// processed in the main loop, in bytes and 32-bit words respectively.

	// xts_init uses v0-v5 and v8-v12 as scratch, so it is run before
	// aes_begin. aes_begin is defined in aes-macros.S (not visible here);
	// presumably it loads the round keys and branches to the 128f or 192f
	// label for those key sizes, falling through for 256-bit keys --
	// confirm against that file.
	xts_init
	aes_begin KEYP, 128f, 192f
	__aes_xts_crypt \enc, 256
	128:
	__aes_xts_crypt \enc, 128
	192:
	__aes_xts_crypt \enc, 192
	.endm

	// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
	// const u8 *in, u8 *out, size_t len,
	// u8 tweak[16]);
	//
	// |key| is the data key. |tweak| contains the next tweak; the encryption of
	// the original IV with the tweak key was already done. This function supports
	// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
	// |len| must be a multiple of 16 except on the last call. If |len| is a
	// multiple of 16, then this function updates |tweak| to contain the next tweak.
	//
	// The arguments are read from KEYP, INP, OUTP, LEN and TWEAKP (a0-a4). The
	// body is the aes_xts_crypt macro expanded with enc=1, which supplies the
	// return instructions itself.
	SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt 1
	SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

	// Same prototype and calling convention as the encryption function
	// (arguments key, in, out, len, tweak in a0-a4). The body is the
	// aes_xts_crypt macro expanded with enc=0, which supplies the return
	// instructions itself.
	SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt 0
	SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)