/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
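	// The \b1-\b4 arguments are ignored here; they exist so that
	// __pmull_\pn can be expanded with a single argument list
	// whether \pn is p64 or p8 (which uses them as scratch operands).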

/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
 */
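/*
 * In outline (a sketch of the math, not the exact instruction
 * schedule): with An/Bn denoting A/B rotated left by n bytes, the
 * 64x64 product is assembled from vmull.p8 partial products as
 *
 *	D  = A*B
 *	t0 = A1*B + A*B1	(folded in at bit offset 8)
 *	t1 = A2*B + A*B2	(offset 16)
 *	t2 = A3*B + A*B3	(offset 24)
 *	t3 = A*B4		(offset 32)
 *
 * with each tn masked (k16/k32/k48) so that the bytes that wrapped
 * around in the rotations cancel out before the final XOR into D.
 */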
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
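	// The reduction is modulo the GHASH field polynomial
	// x^128 + x^7 + x^2 + x + 1; MASK is loaded with the constant
	// 0xe1 << 57 derived from it (see the entry points below).
	//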
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
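	// This computes the same reduction, but with the multiplication
	// by the polynomial's low-order terms open-coded as left shifts
	// (#57, #62, #63) and right shifts plus XORs.
	//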
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

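	//
	// Core GHASH helper. Parameters:
	//
	//	\pn		pmull flavor to use: p64 or p8
	//	\enc		if non-blank, the prefix of a cipher hook
	//			(\enc\()_1x / \enc\()_4x) invoked on each
	//			batch of blocks before it is hashed
	//	\aggregate	if set, process 4 blocks per iteration
	//			using the precomputed powers of H
	//	\head		if set, hash an optional head block passed
	//			on the stack before the main data
	//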
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 */
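/*
 * Per the AAPCS calling convention: r0 = blocks, r1 = dg, r2 = src,
 * r3 = k; the fifth argument (head) is passed on the stack, which is
 * why ghash_update fetches it with 'ldr ip, [sp]'.
 */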
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

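	// Perform one full AES round on each register in \regs:
	// aese.8 does AddRoundKey/SubBytes/ShiftRows with round key \rk,
	// and aesmc.8 applies MixColumns.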
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

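	// Encrypt \regs in parallel using the key schedule at \rkp,
	// double-buffering the round keys through ek0/ek1 so that the
	// loads overlap the crypto instructions. \rounds picks the
	// variant: < 12 branches straight to the common tail (AES-128),
	// == 12 runs two extra rounds first (AES-192), and > 12 runs
	// four (AES-256). The final round uses aese without aesmc,
	// followed by the last AddRoundKey (veor).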
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

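	// CTR keystream helpers: the counter block is the 12-byte IV at
	// r5 followed by the 32-bit counter in r7, byte-swapped (rev)
	// into the last word; r7 is incremented once per emitted block.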
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

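	// Produce the final keystream blocks: e0 encrypts the current
	// counter block (for any partial tail block of data), and e1
	// encrypts the counter block with counter value 1, which GCM
	// uses to mask the authentication tag.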
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

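	// Cipher hooks for ghash_update: the enc_* flavors encrypt the
	// input registers in place so that the ciphertext is what gets
	// hashed, while the dec_* flavors hash the ciphertext input
	// as-is and store the decrypted result to the dst pointer (r4).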
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

/*
 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
 *			    struct gcm_key const *k, char *head,
 *			    char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

/*
 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
 *			   struct gcm_key const *k, char *head,
 *			   char *iv, int rounds, u32 counter,
 *			   const char *otag, int authsize)
 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
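	//
	// Permute table for partial blocks: a 16-byte load taken at an
	// offset into this table yields vtbl index vectors that move the
	// valid bytes of a partial block into position and zero all
	// other lanes, since out-of-range (0xff) vtbl indices produce 0.
	//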
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff