| /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ |
| // |
| // AES-NI optimized AES-GCM for x86_64 |
| // |
| // Copyright 2024 Google LLC |
| // |
| // Author: Eric Biggers <ebiggers@google.com> |
| // |
| //------------------------------------------------------------------------------ |
| // |
| // This file is dual-licensed, meaning that you can use it under your choice of |
| // either of the following two licenses: |
| // |
| // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy |
| // of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // or |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // 1. Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // |
| // 2. Redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| // |
| //------------------------------------------------------------------------------ |
| // |
| // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that |
| // support the original set of AES instructions, i.e. AES-NI. Two |
| // implementations are provided, one that uses AVX and one that doesn't. They |
| // are very similar, being generated by the same macros. The only difference is |
| // that the AVX implementation takes advantage of VEX-coded instructions in some |
| // places to avoid some 'movdqu' and 'movdqa' instructions. The AVX |
| // implementation does *not* use 256-bit vectors, as AES is not supported on |
| // 256-bit vectors until the VAES feature (which this file doesn't target). |
| // |
| // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 |
| // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems |
| // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) |
| // |
| // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is |
| // more thoroughly commented. This file has the following notable changes: |
| // |
// - The vector length is fixed at 128 bits, i.e. xmm registers. This means
// there is only one AES block (and GHASH block) per register.
| // |
// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
// 32. We work around this by being much more careful about using
// registers, relying heavily on loading values from memory only as they
// are needed.
| // |
| // - Masking is not available either. We work around this by implementing |
| // partial block loads and stores using overlapping scalar loads and stores |
| // combined with shifts and SSE4.1 insertion and extraction instructions. |
| // |
| // - The main loop is organized differently due to the different design |
| // constraints. First, with just one AES block per SIMD register, on some |
| // CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore |
| // do an 8-register wide loop. Considering that and the fact that we have |
| // just 16 SIMD registers to work with, it's not feasible to cache AES |
| // round keys and GHASH key powers in registers across loop iterations. |
| // That's not ideal, but also not actually that bad, since loads can run in |
| // parallel with other instructions. Significantly, this also makes it |
| // possible to roll up the inner loops, relying on hardware loop unrolling |
| // instead of software loop unrolling, greatly reducing code size. |
| // |
| // - We implement the GHASH multiplications in the main loop using Karatsuba |
| // multiplication instead of schoolbook multiplication. This saves one |
| // pclmulqdq instruction per block, at the cost of one 64-bit load, one |
| // pshufd, and 0.25 pxors per block. (This is without the three-argument |
| // XOR support that would be provided by AVX512 / AVX10, which would be |
| // more beneficial to schoolbook than Karatsuba.) |
| // |
| // As a rough approximation, we can assume that Karatsuba multiplication is |
| // faster than schoolbook multiplication in this context if one pshufd and |
| // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit |
| // load is "free" due to running in parallel with arithmetic instructions.) |
| // This is true on AMD CPUs, including all that support pclmulqdq up to at |
| // least Zen 3. It's also true on older Intel CPUs: Westmere through |
| // Haswell on the Core side, and Silvermont through Goldmont Plus on the |
| // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the |
| // benefit of Karatsuba should be substantial. On newer Intel CPUs, |
| // schoolbook multiplication should be faster, but only marginally. |
| // |
| // Not all these CPUs were available to be tested. However, benchmarks on |
| // available CPUs suggest that this approximation is plausible. Switching |
| // to Karatsuba showed negligible change (< 1%) on Intel Broadwell, |
| // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. |
| // Considering that and the fact that Karatsuba should be even more |
| // beneficial on older Intel CPUs, it seems like the right choice here. |
| // |
| // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be |
| // saved by using a multiplication-less reduction method. We don't do that |
| // because it would require a large number of shift and xor instructions, |
| // making it less worthwhile and likely harmful on newer CPUs. |
| // |
| // It does make sense to sometimes use a different reduction optimization |
| // that saves a pclmulqdq, though: precompute the hash key times x^64, and |
| // multiply the low half of the data block by the hash key with the extra |
| // factor of x^64. This eliminates one step of the reduction. However, |
| // this is incompatible with Karatsuba multiplication. Therefore, for |
| // multi-block processing we use Karatsuba multiplication with a regular |
| // reduction. For single-block processing, we use the x^64 optimization. |
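//
// Concretely, in this file: the 8-block main loop (_ghash_update_begin_8x
// and friends) and _ghash_mul_noreduce + _ghash_reduce use Karatsuba
// multiplication with the regular two-fold reduction, while the
// single-block _ghash_mul and _ghash_mul_step (used for precomputation,
// the AAD, and the final lengths block) use schoolbook multiplication
// with the precomputed H^1 * x^64 and a single fold.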
| |
| #include <linux/linkage.h> |
| |
| .section .rodata |
| .p2align 4 |
| .Lbswap_mask: |
| .octa 0x000102030405060708090a0b0c0d0e0f |
| .Lgfpoly: |
| .quad 0xc200000000000000 |
| .Lone: |
| .quad 1 |
| .Lgfpoly_and_internal_carrybit: |
| .octa 0xc2000000000000010000000000000001 |
| // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of |
| // 'len' 0xff bytes and the rest zeroes. |
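// For example, with len = 3 the load starts at .Lzeropad_mask + 13 and thus
// picks up three 0xff bytes followed by 13 zero bytes.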
| .Lzeropad_mask: |
| .octa 0xffffffffffffffffffffffffffffffff |
| .octa 0 |
| |
| // Offsets in struct aes_gcm_key_aesni |
| #define OFFSETOF_AESKEYLEN 480 |
| #define OFFSETOF_H_POWERS 496 |
| #define OFFSETOF_H_POWERS_XORED 624 |
| #define OFFSETOF_H_TIMES_X64 688 |
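
// As implied by the offsets above and by aes_gcm_precompute below, the
// GHASH-related fields hold the hash key powers H^8 through H^1 (16 bytes
// each, highest power first) at OFFSETOF_H_POWERS, the 8-byte XOR'd halves of
// each power in the same order at OFFSETOF_H_POWERS_XORED, and H^1 * x^64 at
// OFFSETOF_H_TIMES_X64.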
| |
| .text |
| |
| // Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback |
| // assumes that all operands are distinct and that any mem operand is aligned. |
| .macro _vpclmulqdq imm, src1, src2, dst |
| .if USE_AVX |
| vpclmulqdq \imm, \src1, \src2, \dst |
| .else |
| movdqa \src2, \dst |
| pclmulqdq \imm, \src1, \dst |
| .endif |
| .endm |
| |
| // Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes |
| // that all operands are distinct and that any mem operand is aligned. |
| .macro _vpshufb src1, src2, dst |
| .if USE_AVX |
| vpshufb \src1, \src2, \dst |
| .else |
| movdqa \src2, \dst |
| pshufb \src1, \dst |
| .endif |
| .endm |
| |
| // Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that |
| // all operands are distinct. |
| .macro _vpand src1, src2, dst |
| .if USE_AVX |
| vpand \src1, \src2, \dst |
| .else |
| movdqu \src1, \dst |
| pand \src2, \dst |
| .endif |
| .endm |
| |
| // XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must |
| // be a temporary xmm register. |
| .macro _xor_mem_to_reg mem, reg, tmp |
| .if USE_AVX |
| vpxor \mem, \reg, \reg |
| .else |
| movdqu \mem, \tmp |
| pxor \tmp, \reg |
| .endif |
| .endm |
| |
| // Test the unaligned memory operand \mem against the xmm register \reg. \tmp |
| // must be a temporary xmm register. |
| .macro _test_mem mem, reg, tmp |
| .if USE_AVX |
| vptest \mem, \reg |
| .else |
| movdqu \mem, \tmp |
| ptest \tmp, \reg |
| .endif |
| .endm |
| |
| // Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst |
| // and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. |
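//
// For example, with %ecx = 11: the first load below reads bytes 0-7, the
// second reads bytes 3-10, and the 'shr' by (8 - 11)*8 = -24, i.e. 40 once the
// shift count is masked to 6 bits, discards the 5 bytes that overlap the first
// load. Bytes 8-10 remain, zero-extended, and become the upper half of \dst
// via the 'pinsrq'.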
| .macro _load_partial_block src, dst, tmp64, tmp32 |
| sub $8, %ecx // LEN - 8 |
| jle .Lle8\@ |
| |
| // Load 9 <= LEN <= 15 bytes. |
| movq (\src), \dst // Load first 8 bytes |
| mov (\src, %rcx), %rax // Load last 8 bytes |
| neg %ecx |
| shl $3, %ecx |
| shr %cl, %rax // Discard overlapping bytes |
| pinsrq $1, %rax, \dst |
| jmp .Ldone\@ |
| |
| .Lle8\@: |
| add $4, %ecx // LEN - 4 |
| jl .Llt4\@ |
| |
| // Load 4 <= LEN <= 8 bytes. |
| mov (\src), %eax // Load first 4 bytes |
| mov (\src, %rcx), \tmp32 // Load last 4 bytes |
| jmp .Lcombine\@ |
| |
| .Llt4\@: |
| // Load 1 <= LEN <= 3 bytes. |
| add $2, %ecx // LEN - 2 |
| movzbl (\src), %eax // Load first byte |
| jl .Lmovq\@ |
| movzwl (\src, %rcx), \tmp32 // Load last 2 bytes |
| .Lcombine\@: |
| shl $3, %ecx |
| shl %cl, \tmp64 |
| or \tmp64, %rax // Combine the two parts |
| .Lmovq\@: |
| movq %rax, \dst |
| .Ldone\@: |
| .endm |
| |
| // Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. |
| // Clobbers %rax, %rcx, and %rsi. |
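//
// For example, with %ecx = 11: the 'ror' by (11 - 8)*8 = 24 bits and the
// 8-byte store at \dst + 3 place bytes 8-10 of \src at \dst + 8..10. The
// rotated bytes temporarily written to \dst + 3..7 are then overwritten by
// the final 8-byte store of the low half of \src.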
| .macro _store_partial_block src, dst |
| sub $8, %ecx // LEN - 8 |
| jl .Llt8\@ |
| |
| // Store 8 <= LEN <= 15 bytes. |
| pextrq $1, \src, %rax |
| mov %ecx, %esi |
| shl $3, %ecx |
| ror %cl, %rax |
| mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes |
| movq \src, (\dst) // Store first 8 bytes |
| jmp .Ldone\@ |
| |
| .Llt8\@: |
| add $4, %ecx // LEN - 4 |
| jl .Llt4\@ |
| |
| // Store 4 <= LEN <= 7 bytes. |
| pextrd $1, \src, %eax |
| mov %ecx, %esi |
| shl $3, %ecx |
| ror %cl, %eax |
| mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes |
| movd \src, (\dst) // Store first 4 bytes |
| jmp .Ldone\@ |
| |
| .Llt4\@: |
| // Store 1 <= LEN <= 3 bytes. |
| pextrb $0, \src, 0(\dst) |
| cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? |
| jl .Ldone\@ |
| pextrb $1, \src, 1(\dst) |
| je .Ldone\@ |
| pextrb $2, \src, 2(\dst) |
| .Ldone\@: |
| .endm |
| |
| // Do one step of GHASH-multiplying \a by \b and storing the reduced product in |
| // \b. To complete all steps, this must be invoked with \i=0 through \i=9. |
| // \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the |
| // .Lgfpoly constant, and \t0-\t1 must be temporary registers. |
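//
// Note that only a single fold (of MI into HI) is needed here, rather than the
// two folds done by _ghash_reduce: because b_L is multiplied by the
// precomputed \a * x^64 instead of by \a itself, the contribution that would
// otherwise form a separate LO part already lands in MI and HI. See the file
// comment for why this optimization is used only for single-block processing.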
| .macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 |
| |
| // MI = (a_L * b_H) + ((a*x^64)_L * b_L) |
| .if \i == 0 |
| _vpclmulqdq $0x01, \a, \b, \t0 |
| .elseif \i == 1 |
| _vpclmulqdq $0x00, \a_times_x64, \b, \t1 |
| .elseif \i == 2 |
| pxor \t1, \t0 |
| |
| // HI = (a_H * b_H) + ((a*x^64)_H * b_L) |
| .elseif \i == 3 |
| _vpclmulqdq $0x11, \a, \b, \t1 |
| .elseif \i == 4 |
| pclmulqdq $0x10, \a_times_x64, \b |
| .elseif \i == 5 |
| pxor \t1, \b |
| .elseif \i == 6 |
| |
| // Fold MI into HI. |
| pshufd $0x4e, \t0, \t1 // Swap halves of MI |
| .elseif \i == 7 |
| pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) |
| .elseif \i == 8 |
| pxor \t1, \b |
| .elseif \i == 9 |
| pxor \t0, \b |
| .endif |
| .endm |
| |
| // GHASH-multiply \a by \b and store the reduced product in \b. |
| // See _ghash_mul_step for details. |
| .macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 |
| .irp i, 0,1,2,3,4,5,6,7,8,9 |
| _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 |
| .endr |
| .endm |
| |
| // GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. |
| // This does Karatsuba multiplication and must be paired with _ghash_reduce. On |
| // the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the |
| // two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. |
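//
// The Karatsuba identity used, with all additions being XOR: if
// LO = a_L * b_L and HI = a_H * b_H, then the middle term of the product is
//
//	a_L*b_H + a_H*b_L = (a_L + a_H)*(b_L + b_H) + LO + HI,
//
// so only three pclmulqdq instructions are needed per block. The "+ LO + HI"
// part is deferred to _ghash_reduce.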
| .macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 |
| |
| // LO += a_L * b_L |
| _vpclmulqdq $0x00, \a, \b, \t0 |
| pxor \t0, \lo |
| |
| // b_L + b_H |
| pshufd $0x4e, \b, \t0 |
| pxor \b, \t0 |
| |
| // HI += a_H * b_H |
| pclmulqdq $0x11, \a, \b |
| pxor \b, \hi |
| |
| // MI += (a_L + a_H) * (b_L + b_H) |
| pclmulqdq $0x00, \a_xored, \t0 |
| pxor \t0, \mi |
| .endm |
| |
| // Reduce the product from \lo, \mi, and \hi, and store the result in \dst. |
| // This assumes that _ghash_mul_noreduce was used. |
| .macro _ghash_reduce lo, mi, hi, dst, t0 |
| |
| movq .Lgfpoly(%rip), \t0 |
| |
| // MI += LO + HI (needed because we used Karatsuba multiplication) |
| pxor \lo, \mi |
| pxor \hi, \mi |
| |
| // Fold LO into MI. |
| pshufd $0x4e, \lo, \dst |
| pclmulqdq $0x00, \t0, \lo |
| pxor \dst, \mi |
| pxor \lo, \mi |
| |
| // Fold MI into HI. |
| pshufd $0x4e, \mi, \dst |
| pclmulqdq $0x00, \t0, \mi |
| pxor \hi, \dst |
| pxor \mi, \dst |
| .endm |
| |
| // Do the first step of the GHASH update of a set of 8 ciphertext blocks. |
| // |
| // The whole GHASH update does: |
| // |
| // GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + |
| // blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 |
| // |
| // This macro just does the first step: it does the unreduced multiplication |
| // (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm |
| // registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the |
| // inner block counter in %rax, which is a value that counts up by 8 for each |
| // block in the set of 8 and is used later to index by 8*blknum and 16*blknum. |
| // |
| // To reduce the number of pclmulqdq instructions required, both this macro and |
| // _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook |
| // multiplication. See the file comment for more details about this choice. |
| // |
| // Both macros expect the ciphertext blocks blk[0-7] to be available at DST if |
| // encrypting, or SRC if decrypting. They also expect the precomputed hash key |
| // powers H^i and their XOR'd-together halves to be available in the struct |
| // pointed to by KEY. Both macros clobber TMP[0-2]. |
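//
// Concretely, for block number n (0-7) the counter holds %rax = 8*n, so
// OFFSETOF_H_POWERS(KEY,%rax,2) addresses H^(8-n) (the 16-byte hash key powers
// are stored from H^8 down to H^1), OFFSETOF_H_POWERS_XORED(KEY,%rax)
// addresses the corresponding XOR'd halves (8 bytes each), and (DST,%rax,2) or
// (SRC,%rax,2) addresses ciphertext block n.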
| .macro _ghash_update_begin_8x enc |
| |
| // Initialize the inner block counter. |
| xor %eax, %eax |
| |
| // Load the highest hash key power, H^8. |
| movdqa OFFSETOF_H_POWERS(KEY), TMP0 |
| |
| // Load the first ciphertext block and byte-reflect it. |
| .if \enc |
| movdqu (DST), TMP1 |
| .else |
| movdqu (SRC), TMP1 |
| .endif |
| pshufb BSWAP_MASK, TMP1 |
| |
| // Add the GHASH accumulator to the ciphertext block to get the block |
| // 'b' that needs to be multiplied with the hash key power 'a'. |
| pxor TMP1, GHASH_ACC |
| |
| // b_L + b_H |
| pshufd $0x4e, GHASH_ACC, MI |
| pxor GHASH_ACC, MI |
| |
| // LO = a_L * b_L |
| _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO |
| |
| // HI = a_H * b_H |
| pclmulqdq $0x11, TMP0, GHASH_ACC |
| |
| // MI = (a_L + a_H) * (b_L + b_H) |
| pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI |
| .endm |
| |
| // Continue the GHASH update of 8 ciphertext blocks as described above by doing |
| // an unreduced multiplication of the next ciphertext block by the next lowest |
| // key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. |
| .macro _ghash_update_continue_8x enc |
| add $8, %eax |
| |
| // Load the next lowest key power. |
| movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 |
| |
| // Load the next ciphertext block and byte-reflect it. |
| .if \enc |
| movdqu (DST,%rax,2), TMP1 |
| .else |
| movdqu (SRC,%rax,2), TMP1 |
| .endif |
| pshufb BSWAP_MASK, TMP1 |
| |
| // LO += a_L * b_L |
| _vpclmulqdq $0x00, TMP0, TMP1, TMP2 |
| pxor TMP2, LO |
| |
| // b_L + b_H |
| pshufd $0x4e, TMP1, TMP2 |
| pxor TMP1, TMP2 |
| |
| // HI += a_H * b_H |
| pclmulqdq $0x11, TMP0, TMP1 |
| pxor TMP1, GHASH_ACC |
| |
| // MI += (a_L + a_H) * (b_L + b_H) |
| movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 |
| pclmulqdq $0x00, TMP1, TMP2 |
| pxor TMP2, MI |
| .endm |
| |
| // Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to |
| // _ghash_reduce, but it's hardcoded to use the registers of the main loop and |
| // it uses the same register for HI and the destination. It's also divided into |
| // two steps. TMP1 must be preserved across steps. |
| // |
| // One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of |
| // shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would |
| // increase the critical path length, and it seems to slightly hurt performance. |
| .macro _ghash_update_end_8x_step i |
| .if \i == 0 |
| movq .Lgfpoly(%rip), TMP1 |
| pxor LO, MI |
| pxor GHASH_ACC, MI |
| pshufd $0x4e, LO, TMP2 |
| pclmulqdq $0x00, TMP1, LO |
| pxor TMP2, MI |
| pxor LO, MI |
| .elseif \i == 1 |
| pshufd $0x4e, MI, TMP2 |
| pclmulqdq $0x00, TMP1, MI |
| pxor TMP2, GHASH_ACC |
| pxor MI, GHASH_ACC |
| .endif |
| .endm |
| |
| // void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); |
| // |
// Given the expanded AES key, derive the GHASH subkey and initialize the
// GHASH-related fields in the key struct.
| .macro _aes_gcm_precompute |
| |
| // Function arguments |
| .set KEY, %rdi |
| |
| // Additional local variables. |
| // %xmm0-%xmm1 and %rax are used as temporaries. |
| .set RNDKEYLAST_PTR, %rsi |
| .set H_CUR, %xmm2 |
| .set H_POW1, %xmm3 // H^1 |
| .set H_POW1_X64, %xmm4 // H^1 * x^64 |
| .set GFPOLY, %xmm5 |
| |
| // Encrypt an all-zeroes block to get the raw hash subkey. |
| movl OFFSETOF_AESKEYLEN(KEY), %eax |
| lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR |
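	// (RNDKEYLAST_PTR = KEY + 4*keylen + 6*16, i.e. the round key with
	// index keylen/4 + 6: 10, 12, or 14 for AES-128, AES-192, or AES-256.)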
| movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block |
| lea 16(KEY), %rax |
| 1: |
| aesenc (%rax), H_POW1 |
| add $16, %rax |
| cmp %rax, RNDKEYLAST_PTR |
| jne 1b |
| aesenclast (RNDKEYLAST_PTR), H_POW1 |
| |
| // Preprocess the raw hash subkey as needed to operate on GHASH's |
| // bit-reflected values directly: reflect its bytes, then multiply it by |
| // x^-1 (using the backwards interpretation of polynomial coefficients |
| // from the GCM spec) or equivalently x^1 (using the alternative, |
| // natural interpretation of polynomial coefficients). |
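	//
	// The multiplication by x below is a 1-bit left shift of the 128-bit
	// value. %xmm0 is set up to hold the two fix-ups this needs: the carry
	// of bit 63 into bit 64 (since 'paddq' shifts the two 64-bit halves
	// independently), and, if bit 127 is shifted out, the reduction terms
	// bit 0 and bits 121, 126, and 127 (the .Lgfpoly bits in the high
	// half), all of which are conditionally XOR'd in at the end.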
| pshufb .Lbswap_mask(%rip), H_POW1 |
| movdqa H_POW1, %xmm0 |
| pshufd $0xd3, %xmm0, %xmm0 |
| psrad $31, %xmm0 |
| paddq H_POW1, H_POW1 |
| pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 |
| pxor %xmm0, H_POW1 |
| |
| // Store H^1. |
| movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) |
| |
| // Compute and store H^1 * x^64. |
| movq .Lgfpoly(%rip), GFPOLY |
| pshufd $0x4e, H_POW1, %xmm0 |
| _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 |
| pxor %xmm0, H_POW1_X64 |
| movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) |
| |
| // Compute and store the halves of H^1 XOR'd together. |
| pxor H_POW1, %xmm0 |
| movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) |
| |
| // Compute and store the remaining key powers H^2 through H^8. |
| movdqa H_POW1, H_CUR |
| mov $6*8, %eax |
| .Lprecompute_next\@: |
| // Compute H^i = H^{i-1} * H^1. |
| _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 |
| // Store H^i. |
| movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) |
| // Compute and store the halves of H^i XOR'd together. |
| pshufd $0x4e, H_CUR, %xmm0 |
| pxor H_CUR, %xmm0 |
| movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) |
| sub $8, %eax |
| jge .Lprecompute_next\@ |
| |
| RET |
| .endm |
| |
| // void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, |
| // u8 ghash_acc[16], const u8 *aad, int aadlen); |
| // |
| // This function processes the AAD (Additional Authenticated Data) in GCM. |
| // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the |
| // data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all |
| // zeroes. |aadlen| must be a multiple of 16, except on the last call where it |
| // can be any length. The caller must do any buffering needed to ensure this. |
| .macro _aes_gcm_aad_update |
| |
| // Function arguments |
| .set KEY, %rdi |
| .set GHASH_ACC_PTR, %rsi |
| .set AAD, %rdx |
| .set AADLEN, %ecx |
| // Note: _load_partial_block relies on AADLEN being in %ecx. |
| |
| // Additional local variables. |
| // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. |
| .set BSWAP_MASK, %xmm2 |
| .set GHASH_ACC, %xmm3 |
| .set H_POW1, %xmm4 // H^1 |
| .set H_POW1_X64, %xmm5 // H^1 * x^64 |
| .set GFPOLY, %xmm6 |
| |
| movdqa .Lbswap_mask(%rip), BSWAP_MASK |
| movdqu (GHASH_ACC_PTR), GHASH_ACC |
| movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 |
| movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 |
| movq .Lgfpoly(%rip), GFPOLY |
| |
| // Process the AAD one full block at a time. |
| sub $16, AADLEN |
| jl .Laad_loop_1x_done\@ |
| .Laad_loop_1x\@: |
| movdqu (AAD), %xmm0 |
| pshufb BSWAP_MASK, %xmm0 |
| pxor %xmm0, GHASH_ACC |
| _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 |
| add $16, AAD |
| sub $16, AADLEN |
| jge .Laad_loop_1x\@ |
| .Laad_loop_1x_done\@: |
| // Check whether there is a partial block at the end. |
| add $16, AADLEN |
| jz .Laad_done\@ |
| |
| // Process a partial block of length 1 <= AADLEN <= 15. |
| // _load_partial_block assumes that %ecx contains AADLEN. |
| _load_partial_block AAD, %xmm0, %r10, %r10d |
| pshufb BSWAP_MASK, %xmm0 |
| pxor %xmm0, GHASH_ACC |
| _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 |
| |
| .Laad_done\@: |
| movdqu GHASH_ACC, (GHASH_ACC_PTR) |
| RET |
| .endm |
| |
| // Increment LE_CTR eight times to generate eight little-endian counter blocks, |
| // swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with |
| // the zero-th AES round key. Clobbers TMP0 and TMP1. |
| .macro _ctr_begin_8x |
| movq .Lone(%rip), TMP0 |
| movdqa (KEY), TMP1 // zero-th round key |
| .irp i, 0,1,2,3,4,5,6,7 |
| _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i |
| pxor TMP1, AESDATA\i |
| paddd TMP0, LE_CTR |
| .endr |
| .endm |
| |
| // Do a non-last round of AES on AESDATA[0-7] using \round_key. |
| .macro _aesenc_8x round_key |
| .irp i, 0,1,2,3,4,5,6,7 |
| aesenc \round_key, AESDATA\i |
| .endr |
| .endm |
| |
| // Do the last round of AES on AESDATA[0-7] using \round_key. |
| .macro _aesenclast_8x round_key |
| .irp i, 0,1,2,3,4,5,6,7 |
| aesenclast \round_key, AESDATA\i |
| .endr |
| .endm |
| |
| // XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and |
| // store the result to DST. Clobbers TMP0. |
| .macro _xor_data_8x |
| .irp i, 0,1,2,3,4,5,6,7 |
| _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 |
| .endr |
| .irp i, 0,1,2,3,4,5,6,7 |
| movdqu AESDATA\i, \i*16(DST) |
| .endr |
| .endm |
| |
| // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, |
| // const u32 le_ctr[4], u8 ghash_acc[16], |
| // const u8 *src, u8 *dst, int datalen); |
| // |
| // This macro generates a GCM encryption or decryption update function with the |
| // above prototype (with \enc selecting which one). |
| // |
| // This function computes the next portion of the CTR keystream, XOR's it with |
| // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted |
| // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the |
| // next |datalen| ciphertext bytes. |
| // |
| // |datalen| must be a multiple of 16, except on the last call where it can be |
| // any length. The caller must do any buffering needed to ensure this. Both |
| // in-place and out-of-place en/decryption are supported. |
| // |
| // |le_ctr| must give the current counter in little-endian format. For a new |
| // message, the low word of the counter must be 2. This function loads the |
| // counter from |le_ctr| and increments the loaded counter as needed, but it |
| // does *not* store the updated counter back to |le_ctr|. The caller must |
| // update |le_ctr| if any more data segments follow. Internally, only the low |
| // 32-bit word of the counter is incremented, following the GCM standard. |
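//
// As a rough illustration (C-like pseudocode, not the actual glue code;
// buffering of partial 16-byte chunks, kernel_fpu_begin/end, and error
// handling are omitted), the expected calling sequence for encryption is:
//
//	aes_gcm_precompute_aesni(key);             // once per key
//	memset(ghash_acc, 0, 16);
//	aes_gcm_aad_update_aesni(key, ghash_acc, aad, aadlen);
//	le_ctr = initial counter block, with the low 32-bit word set to 2;
//	for each data segment (all but the last a multiple of 16 bytes long):
//		aes_gcm_enc_update_aesni(key, le_ctr, ghash_acc, src, dst, len);
//		le_ctr[0] += len / 16;             // advance for the next segment
//	aes_gcm_enc_final_aesni(key, le_ctr, ghash_acc, total_aadlen,
//				total_datalen);    // ghash_acc now holds the tag
//
// Decryption is analogous, using aes_gcm_dec_update_aesni and
// aes_gcm_dec_final_aesni, with the latter verifying the tag instead of
// producing it.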
| .macro _aes_gcm_update enc |
| |
| // Function arguments |
| .set KEY, %rdi |
| .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg |
| .set GHASH_ACC_PTR, %rdx |
| .set SRC, %rcx |
| .set DST, %r8 |
| .set DATALEN, %r9d |
| .set DATALEN64, %r9 // Zero-extend DATALEN before using! |
| // Note: the code setting up for _load_partial_block assumes that SRC is |
| // in %rcx (and that DATALEN is *not* in %rcx). |
| |
| // Additional local variables |
| |
| // %rax and %rsi are used as temporary registers. Note: %rsi overlaps |
| // with LE_CTR_PTR, which is used only at the beginning. |
| |
| .set AESKEYLEN, %r10d // AES key length in bytes |
| .set AESKEYLEN64, %r10 |
| .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key |
| |
| // Put the most frequently used values in %xmm0-%xmm7 to reduce code |
| // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) |
| .set TMP0, %xmm0 |
| .set TMP1, %xmm1 |
| .set TMP2, %xmm2 |
| .set LO, %xmm3 // Low part of unreduced product |
| .set MI, %xmm4 // Middle part of unreduced product |
| .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also |
| // the high part of unreduced product |
| .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes |
| .set LE_CTR, %xmm7 // Little-endian counter value |
| .set AESDATA0, %xmm8 |
| .set AESDATA1, %xmm9 |
| .set AESDATA2, %xmm10 |
| .set AESDATA3, %xmm11 |
| .set AESDATA4, %xmm12 |
| .set AESDATA5, %xmm13 |
| .set AESDATA6, %xmm14 |
| .set AESDATA7, %xmm15 |
| |
| movdqa .Lbswap_mask(%rip), BSWAP_MASK |
| movdqu (GHASH_ACC_PTR), GHASH_ACC |
| movdqu (LE_CTR_PTR), LE_CTR |
| |
| movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN |
| lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR |
| |
| // If there are at least 8*16 bytes of data, then continue into the main |
| // loop, which processes 8*16 bytes of data per iteration. |
| // |
| // The main loop interleaves AES and GHASH to improve performance on |
| // CPUs that can execute these instructions in parallel. When |
| // decrypting, the GHASH input (the ciphertext) is immediately |
| // available. When encrypting, we instead encrypt a set of 8 blocks |
| // first and then GHASH those blocks while encrypting the next set of 8, |
| // repeat that as needed, and finally GHASH the last set of 8 blocks. |
| // |
| // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, |
| // as this makes the immediate fit in a signed byte, saving 3 bytes. |
| add $-8*16, DATALEN |
| jl .Lcrypt_loop_8x_done\@ |
| .if \enc |
| // Encrypt the first 8 plaintext blocks. |
| _ctr_begin_8x |
| lea 16(KEY), %rsi |
| .p2align 4 |
| 1: |
| movdqa (%rsi), TMP0 |
| _aesenc_8x TMP0 |
| add $16, %rsi |
| cmp %rsi, RNDKEYLAST_PTR |
| jne 1b |
| movdqa (%rsi), TMP0 |
| _aesenclast_8x TMP0 |
| _xor_data_8x |
| // Don't increment DST until the ciphertext blocks have been hashed. |
| sub $-8*16, SRC |
| add $-8*16, DATALEN |
| jl .Lghash_last_ciphertext_8x\@ |
| .endif |
| |
| .p2align 4 |
| .Lcrypt_loop_8x\@: |
| |
| // Generate the next set of 8 counter blocks and start encrypting them. |
| _ctr_begin_8x |
| lea 16(KEY), %rsi |
| |
| // Do a round of AES, and start the GHASH update of 8 ciphertext blocks |
| // by doing the unreduced multiplication for the first ciphertext block. |
| movdqa (%rsi), TMP0 |
| add $16, %rsi |
| _aesenc_8x TMP0 |
| _ghash_update_begin_8x \enc |
| |
| // Do 7 more rounds of AES, and continue the GHASH update by doing the |
| // unreduced multiplication for the remaining ciphertext blocks. |
| .p2align 4 |
| 1: |
| movdqa (%rsi), TMP0 |
| add $16, %rsi |
| _aesenc_8x TMP0 |
| _ghash_update_continue_8x \enc |
| cmp $7*8, %eax |
| jne 1b |
| |
| // Do the remaining AES rounds. |
| .p2align 4 |
| 1: |
| movdqa (%rsi), TMP0 |
| add $16, %rsi |
| _aesenc_8x TMP0 |
| cmp %rsi, RNDKEYLAST_PTR |
| jne 1b |
| |
| // Do the GHASH reduction and the last round of AES. |
| movdqa (RNDKEYLAST_PTR), TMP0 |
| _ghash_update_end_8x_step 0 |
| _aesenclast_8x TMP0 |
| _ghash_update_end_8x_step 1 |
| |
| // XOR the data with the AES-CTR keystream blocks. |
| .if \enc |
| sub $-8*16, DST |
| .endif |
| _xor_data_8x |
| sub $-8*16, SRC |
| .if !\enc |
| sub $-8*16, DST |
| .endif |
| add $-8*16, DATALEN |
| jge .Lcrypt_loop_8x\@ |
| |
| .if \enc |
| .Lghash_last_ciphertext_8x\@: |
| // Update GHASH with the last set of 8 ciphertext blocks. |
| _ghash_update_begin_8x \enc |
| .p2align 4 |
| 1: |
| _ghash_update_continue_8x \enc |
| cmp $7*8, %eax |
| jne 1b |
| _ghash_update_end_8x_step 0 |
| _ghash_update_end_8x_step 1 |
| sub $-8*16, DST |
| .endif |
| |
| .Lcrypt_loop_8x_done\@: |
| |
| sub $-8*16, DATALEN |
| jz .Ldone\@ |
| |
| // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep |
| // things simple and keep the code size down by just going one block at |
| // a time, again taking advantage of hardware loop unrolling. Since |
| // there are enough key powers available for all remaining data, we do |
| // the GHASH multiplications unreduced, and only reduce at the very end. |
| |
| .set HI, TMP2 |
| .set H_POW, AESDATA0 |
| .set H_POW_XORED, AESDATA1 |
| .set ONE, AESDATA2 |
| |
| movq .Lone(%rip), ONE |
| |
| // Start collecting the unreduced GHASH intermediate value LO, MI, HI. |
| pxor LO, LO |
| pxor MI, MI |
| pxor HI, HI |
| |
| // Set up a block counter %rax to contain 8*(8-n), where n is the number |
| // of blocks that remain, counting any partial block. This will be used |
| // to access the key powers H^n through H^1. |
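	// For example, if DATALEN = 40 then n = 3 (two full blocks plus one
	// partial block), and the computation below yields %rax = 8*(8-3) = 40,
	// so the loop starts with H^3.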
| mov DATALEN, %eax |
| neg %eax |
| and $~15, %eax |
| sar $1, %eax |
| add $64, %eax |
| |
| sub $16, DATALEN |
| jl .Lcrypt_loop_1x_done\@ |
| |
| // Process the data one full block at a time. |
| .Lcrypt_loop_1x\@: |
| |
| // Encrypt the next counter block. |
| _vpshufb BSWAP_MASK, LE_CTR, TMP0 |
| paddd ONE, LE_CTR |
| pxor (KEY), TMP0 |
| lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size |
| cmp $24, AESKEYLEN |
| jl 128f // AES-128? |
| je 192f // AES-192? |
| // AES-256 |
| aesenc -7*16(%rsi), TMP0 |
| aesenc -6*16(%rsi), TMP0 |
| 192: |
| aesenc -5*16(%rsi), TMP0 |
| aesenc -4*16(%rsi), TMP0 |
| 128: |
| .irp i, -3,-2,-1,0,1,2,3,4,5 |
| aesenc \i*16(%rsi), TMP0 |
| .endr |
| aesenclast (RNDKEYLAST_PTR), TMP0 |
| |
| // Load the next key power H^i. |
| movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW |
| movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED |
| |
| // XOR the keystream block that was just generated in TMP0 with the next |
| // source data block and store the resulting en/decrypted data to DST. |
| .if \enc |
| _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 |
| movdqu TMP0, (DST) |
| .else |
| movdqu (SRC), TMP1 |
| pxor TMP1, TMP0 |
| movdqu TMP0, (DST) |
| .endif |
| |
| // Update GHASH with the ciphertext block. |
| .if \enc |
| pshufb BSWAP_MASK, TMP0 |
| pxor TMP0, GHASH_ACC |
| .else |
| pshufb BSWAP_MASK, TMP1 |
| pxor TMP1, GHASH_ACC |
| .endif |
| _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 |
| pxor GHASH_ACC, GHASH_ACC |
| |
| add $8, %eax |
| add $16, SRC |
| add $16, DST |
| sub $16, DATALEN |
| jge .Lcrypt_loop_1x\@ |
| .Lcrypt_loop_1x_done\@: |
| // Check whether there is a partial block at the end. |
| add $16, DATALEN |
| jz .Lghash_reduce\@ |
| |
| // Process a partial block of length 1 <= DATALEN <= 15. |
| |
| // Encrypt a counter block for the last time. |
| pshufb BSWAP_MASK, LE_CTR |
| pxor (KEY), LE_CTR |
| lea 16(KEY), %rsi |
| 1: |
| aesenc (%rsi), LE_CTR |
| add $16, %rsi |
| cmp %rsi, RNDKEYLAST_PTR |
| jne 1b |
| aesenclast (RNDKEYLAST_PTR), LE_CTR |
| |
| // Load the lowest key power, H^1. |
| movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW |
| movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED |
| |
| // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is |
| // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. |
| // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. |
| mov SRC, RNDKEYLAST_PTR |
| mov DATALEN, %ecx |
| _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi |
| |
| // XOR the keystream block that was just generated in LE_CTR with the |
| // source data block and store the resulting en/decrypted data to DST. |
| pxor TMP0, LE_CTR |
| mov DATALEN, %ecx |
| _store_partial_block LE_CTR, DST |
| |
| // If encrypting, zero-pad the final ciphertext block for GHASH. (If |
| // decrypting, this was already done by _load_partial_block.) |
| .if \enc |
| lea .Lzeropad_mask+16(%rip), %rax |
| sub DATALEN64, %rax |
| _vpand (%rax), LE_CTR, TMP0 |
| .endif |
| |
| // Update GHASH with the final ciphertext block. |
| pshufb BSWAP_MASK, TMP0 |
| pxor TMP0, GHASH_ACC |
| _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 |
| |
| .Lghash_reduce\@: |
| // Finally, do the GHASH reduction. |
| _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 |
| |
| .Ldone\@: |
| // Store the updated GHASH accumulator back to memory. |
| movdqu GHASH_ACC, (GHASH_ACC_PTR) |
| |
| RET |
| .endm |
| |
| // void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, |
| // const u32 le_ctr[4], u8 ghash_acc[16], |
| // u64 total_aadlen, u64 total_datalen); |
| // bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, |
| // const u32 le_ctr[4], const u8 ghash_acc[16], |
| // u64 total_aadlen, u64 total_datalen, |
| // const u8 tag[16], int taglen); |
| // |
| // This macro generates one of the above two functions (with \enc selecting |
| // which one). Both functions finish computing the GCM authentication tag by |
| // updating GHASH with the lengths block and encrypting the GHASH accumulator. |
| // |total_aadlen| and |total_datalen| must be the total length of the additional |
| // authenticated data and the en/decrypted data in bytes, respectively. |
| // |
| // The encryption function then stores the full-length (16-byte) computed |
| // authentication tag to |ghash_acc|. The decryption function instead loads the |
| // expected authentication tag (the one that was transmitted) from the 16-byte |
| // buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the |
| // computed tag in constant time, and returns true if and only if they match. |
| .macro _aes_gcm_final enc |
| |
| // Function arguments |
| .set KEY, %rdi |
| .set LE_CTR_PTR, %rsi |
| .set GHASH_ACC_PTR, %rdx |
| .set TOTAL_AADLEN, %rcx |
| .set TOTAL_DATALEN, %r8 |
| .set TAG, %r9 |
| .set TAGLEN, %r10d // Originally at 8(%rsp) |
| .set TAGLEN64, %r10 |
| |
| // Additional local variables. |
| // %rax and %xmm0-%xmm2 are used as temporary registers. |
| .set AESKEYLEN, %r11d |
| .set AESKEYLEN64, %r11 |
| .set BSWAP_MASK, %xmm3 |
| .set GHASH_ACC, %xmm4 |
| .set H_POW1, %xmm5 // H^1 |
| .set H_POW1_X64, %xmm6 // H^1 * x^64 |
| .set GFPOLY, %xmm7 |
| |
| movdqa .Lbswap_mask(%rip), BSWAP_MASK |
| movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN |
| |
| // Set up a counter block with 1 in the low 32-bit word. This is the |
| // counter that produces the ciphertext needed to encrypt the auth tag. |
| movdqu (LE_CTR_PTR), %xmm0 |
| mov $1, %eax |
| pinsrd $0, %eax, %xmm0 |
| |
| // Build the lengths block and XOR it into the GHASH accumulator. |
| movq TOTAL_DATALEN, GHASH_ACC |
| pinsrq $1, TOTAL_AADLEN, GHASH_ACC |
| psllq $3, GHASH_ACC // Bytes to bits |
| _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 |
| |
| movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 |
| movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 |
| movq .Lgfpoly(%rip), GFPOLY |
| |
| // Make %rax point to the 6th from last AES round key. (Using signed |
| // byte offsets -7*16 through 6*16 decreases code size.) |
| lea (KEY,AESKEYLEN64,4), %rax |
| |
| // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. |
| // Interleave the AES and GHASH instructions to improve performance. |
| pshufb BSWAP_MASK, %xmm0 |
| pxor (KEY), %xmm0 |
| cmp $24, AESKEYLEN |
| jl 128f // AES-128? |
| je 192f // AES-192? |
| // AES-256 |
| aesenc -7*16(%rax), %xmm0 |
| aesenc -6*16(%rax), %xmm0 |
| 192: |
| aesenc -5*16(%rax), %xmm0 |
| aesenc -4*16(%rax), %xmm0 |
| 128: |
| .irp i, 0,1,2,3,4,5,6,7,8 |
| aesenc (\i-3)*16(%rax), %xmm0 |
| _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 |
| .endr |
| aesenclast 6*16(%rax), %xmm0 |
| _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 |
| |
| // Undo the byte reflection of the GHASH accumulator. |
| pshufb BSWAP_MASK, GHASH_ACC |
| |
| // Encrypt the GHASH accumulator. |
| pxor %xmm0, GHASH_ACC |
| |
| .if \enc |
| // Return the computed auth tag. |
| movdqu GHASH_ACC, (GHASH_ACC_PTR) |
| .else |
| .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! |
| |
| // Verify the auth tag in constant time by XOR'ing the transmitted and |
| // computed auth tags together and using the ptest instruction to check |
| // whether the first TAGLEN bytes of the result are zero. |
| _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 |
| movl 8(%rsp), TAGLEN |
| lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR |
| sub TAGLEN64, ZEROPAD_MASK_PTR |
| xor %eax, %eax |
| _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 |
| sete %al |
| .endif |
| RET |
| .endm |
| |
| .set USE_AVX, 0 |
| SYM_FUNC_START(aes_gcm_precompute_aesni) |
| _aes_gcm_precompute |
| SYM_FUNC_END(aes_gcm_precompute_aesni) |
| SYM_FUNC_START(aes_gcm_aad_update_aesni) |
| _aes_gcm_aad_update |
| SYM_FUNC_END(aes_gcm_aad_update_aesni) |
| SYM_FUNC_START(aes_gcm_enc_update_aesni) |
| _aes_gcm_update 1 |
| SYM_FUNC_END(aes_gcm_enc_update_aesni) |
| SYM_FUNC_START(aes_gcm_dec_update_aesni) |
| _aes_gcm_update 0 |
| SYM_FUNC_END(aes_gcm_dec_update_aesni) |
| SYM_FUNC_START(aes_gcm_enc_final_aesni) |
| _aes_gcm_final 1 |
| SYM_FUNC_END(aes_gcm_enc_final_aesni) |
| SYM_FUNC_START(aes_gcm_dec_final_aesni) |
| _aes_gcm_final 0 |
| SYM_FUNC_END(aes_gcm_dec_final_aesni) |
| |
| .set USE_AVX, 1 |
| SYM_FUNC_START(aes_gcm_precompute_aesni_avx) |
| _aes_gcm_precompute |
| SYM_FUNC_END(aes_gcm_precompute_aesni_avx) |
| SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) |
| _aes_gcm_aad_update |
| SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) |
| SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) |
| _aes_gcm_update 1 |
| SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) |
| SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) |
| _aes_gcm_update 0 |
| SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) |
| SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) |
| _aes_gcm_final 1 |
| SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) |
| SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) |
| _aes_gcm_final 0 |
| SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) |