#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	the estimate based on resource availability is less than 1.0,
#	i.e. the measured result is worse than expected, presumably
#	because the binary translator is not almighty;

$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
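
# Context layout, as implied by the loads and stores below: bytes 0-23 hold
# the hash value (h0,h1,h2 in base 2^64, or five base 2^26 limbs), byte 24
# the is_base2_26 flag, bytes 32-47 the clamped key (r0,r1), and byte 48
# onwards the base 2^26 key-power table used by the NEON path, whose first
# element is set to the "impossible" value -1 until the table is computed.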

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

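// Each iteration computes h = (h + inp[0:1] + padbit*2^128) * r
// (mod 2^130-5) with 64x64-bit multiplies.  s1 = 5*r1/4 is exact because
// key clamping clears the low two bits of r1; multiplying by s1 folds
// product terms of weight 2^128 and above back down, since
// 2^130 == 5 (mod 2^130-5).  The "final reduction" step then folds
// anything at or above bit 130 down the same way, so h stays only
// partially reduced between blocks; poly1305_emit performs the final
// comparison against the modulus.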
.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

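// Compute h+5.  If that carries into bit 130, then h >= 2^130-5 and the
// reduced value is h+5-2^130, whose low 128 bits are exactly those of h+5;
// otherwise h is already fully reduced.  Only the low 128 bits are emitted
// either way.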
	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros; # borrow
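
# NEON register roles for the two-way vectorized path below: $R0..$R4 and
# $S1..$S4 hold the base 2^26 limbs of the key powers r^1..r^4 (and 5x those
# limbs), one power per .4s lane; $IN01_* and $IN23_* each hold two input
# blocks split into 26-bit limbs; $ACC* are the 64-bit per-lane accumulators
# and $H0..$H4 the running hash in base 2^26.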

$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

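// poly1305_splat takes the key power currently held in $h0..$h2 (base 2^64),
// splits it into five 26-bit limbs, and stores each limb (plus 5*limb for
// limbs 1..4) into the key-power table at a 16-byte stride, so that one lane
// of each .4s table vector corresponds to one power of r.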
.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[$ctx,#48]		// first table element
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

	////////////////////////////////// initialize r^n table
	mov	$h0,$r0			// r^1
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	$h1,$r1
	mov	$h2,xzr
	add	$ctx,$ctx,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	$ctx,$ctx,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	$ctx,$ctx,#4
	bl	poly1305_splat
	sub	$ctx,$ctx,#48		// restore original $ctx
	b	.Ldo_neon

.align	4
.Leven_neon:
	fmov	${H0},x10
	fmov	${H1},x11
	fmov	${H2},x12
	fmov	${H3},x13
	fmov	${H4},x14

.Ldo_neon:
	ldp	x8,x12,[$inp,#32]	// inp[2:3]
	subs	$len,$len,#64
	ldp	x9,x13,[$inp,#48]
	add	$in2,$inp,#96
	adr	$zeros,.Lzeros

	lsl	$padbit,$padbit,#24
	add	x15,$ctx,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN23_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN23_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	$IN23_2,x8
	fmov	$IN23_3,x10
	fmov	$IN23_4,x12

	ldp	x8,x12,[$inp],#16	// inp[0:1]
	ldp	x9,x13,[$inp],#48

	ld1	{$R0,$R1,$S1,$R2},[x15],#64
	ld1	{$S2,$R3,$S3,$R4},[x15],#64
	ld1	{$S4},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	$IN01_0,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,$padbit,x12,lsr#40
	add	x13,$padbit,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	$IN01_1,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	$MASK.2d,#-1
	fmov	$IN01_2,x8
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12
	ushr	$MASK.2d,$MASK.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//      \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//      \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2, because it doesn't
	// depend on the reduction from the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
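	//
	// The two .2s lanes run the two rows above in parallel: lane 0
	// carries the even-indexed blocks and lane 1 the odd-indexed ones.
	// The 5*r_i terms fold limbs of weight 2^130 and above back down,
	// again using 2^130 == 5 (mod 2^130-5).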

	subs	$len,$len,#64
	umull	$ACC4,$IN23_0,${R4}[2]
	csel	$in2,$zeros,$in2,lo
	umull	$ACC3,$IN23_0,${R3}[2]
	umull	$ACC2,$IN23_0,${R2}[2]
	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
	umull	$ACC1,$IN23_0,${R1}[2]
	ldp	x9,x13,[$in2],#48
	umull	$ACC0,$IN23_0,${R0}[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	$ACC4,$IN23_1,${R3}[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC3,$IN23_1,${R2}[2]
	and	x5,x9,#0x03ffffff
	umlal	$ACC2,$IN23_1,${R1}[2]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN23_1,${R0}[2]
	ubfx	x7,x9,#26,#26
	umlal	$ACC0,$IN23_1,${S4}[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	$ACC4,$IN23_2,${R2}[2]
	extr	x8,x12,x8,#52
	umlal	$ACC3,$IN23_2,${R1}[2]
	extr	x9,x13,x9,#52
	umlal	$ACC2,$IN23_2,${R0}[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC1,$IN23_2,${S4}[2]
	fmov	$IN23_0,x4
	umlal	$ACC0,$IN23_2,${S3}[2]
	and	x8,x8,#0x03ffffff

	umlal	$ACC4,$IN23_3,${R1}[2]
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN23_3,${R0}[2]
	ubfx	x10,x12,#14,#26
	umlal	$ACC2,$IN23_3,${S4}[2]
	ubfx	x11,x13,#14,#26
	umlal	$ACC1,$IN23_3,${S3}[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC0,$IN23_3,${S2}[2]
	fmov	$IN23_1,x6

	add	$IN01_2,$IN01_2,$H2
	add	x12,$padbit,x12,lsr#40
	umlal	$ACC4,$IN23_4,${R0}[2]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC3,$IN23_4,${S4}[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC2,$IN23_4,${S3}[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN23_4,${S2}[2]
	fmov	$IN23_2,x8
	umlal	$ACC0,$IN23_4,${S1}[2]
	fmov	$IN23_3,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	$IN01_0,$IN01_0,$H0
	fmov	$IN23_4,x12
	umlal	$ACC3,$IN01_2,${R1}[0]
	ldp	x8,x12,[$inp],#16	// inp[0:1]
	umlal	$ACC0,$IN01_2,${S3}[0]
	ldp	x9,x13,[$inp],#48
	umlal	$ACC4,$IN01_2,${R2}[0]
	umlal	$ACC1,$IN01_2,${S4}[0]
	umlal	$ACC2,$IN01_2,${R0}[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}[0]
	umlal	$ACC4,$IN01_0,${R4}[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	$ACC2,$IN01_0,${R2}[0]
	and	x5,x9,#0x03ffffff
	umlal	$ACC0,$IN01_0,${R0}[0]
	ubfx	x6,x8,#26,#26
	umlal	$ACC1,$IN01_0,${R1}[0]
	ubfx	x7,x9,#26,#26

	add	$IN01_3,$IN01_3,$H3
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	$ACC3,$IN01_1,${R2}[0]
	extr	x8,x12,x8,#52
	umlal	$ACC4,$IN01_1,${R3}[0]
	extr	x9,x13,x9,#52
	umlal	$ACC0,$IN01_1,${S4}[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	$ACC2,$IN01_1,${R1}[0]
	fmov	$IN01_0,x4
	umlal	$ACC1,$IN01_1,${R0}[0]
	and	x8,x8,#0x03ffffff

	add	$IN01_4,$IN01_4,$H4
	and	x9,x9,#0x03ffffff
	umlal	$ACC3,$IN01_3,${R0}[0]
	ubfx	x10,x12,#14,#26
	umlal	$ACC0,$IN01_3,${S2}[0]
	ubfx	x11,x13,#14,#26
	umlal	$ACC4,$IN01_3,${R1}[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	$ACC1,$IN01_3,${S3}[0]
	fmov	$IN01_1,x6
	umlal	$ACC2,$IN01_3,${S4}[0]
	add	x12,$padbit,x12,lsr#40

	umlal	$ACC3,$IN01_4,${S4}[0]
	add	x13,$padbit,x13,lsr#40
	umlal	$ACC0,$IN01_4,${S1}[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	$ACC4,$IN01_4,${R0}[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	$ACC1,$IN01_4,${S2}[0]
	fmov	$IN01_2,x8
	umlal	$ACC2,$IN01_4,${S3}[0]
	fmov	$IN01_3,x10
	fmov	$IN01_4,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
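	//
	// The carry chain below is h3->h4 and h0->h1 first, then h4->h0
	// (the wrapped carry is multiplied by 5 via the extra shl #2 and
	// add), then h1->h2, h2->h3, h0->h1 and finally h3->h4 again;
	// limbs may be left slightly above 26 bits, which the 2^26 radix
	// tolerates.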

	ushr	$T0.2d,$ACC3,#26
	xtn	$H3,$ACC3
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	xtn	$H4,$ACC4
	ushr	$T1.2d,$ACC1,#26
	xtn	$H1,$ACC1
	bic	$H4,#0xfc,lsl#24
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	shrn	$T1.2s,$ACC2,#26
	xtn	$H2,$ACC2
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	bic	$H1,#0xfc,lsl#24
	add	$H3,$H3,$T1.2s		// h2 -> h3
	bic	$H2,#0xfc,lsl#24

	shrn	$T0.2s,$ACC0,#26
	xtn	$H0,$ACC0
	ushr	$T1.2s,$H3,#26
	bic	$H3,#0xfc,lsl#24
	bic	$H0,#0xfc,lsl#24
	add	$H1,$H1,$T0.2s		// h0 -> h1
	add	$H4,$H4,$T1.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	$IN23_2,${IN23_2}[0]
	add	$IN01_2,$IN01_2,$H2

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	$len,$len,#32
	b.ne	.Long_tail

	dup	$IN23_2,${IN01_2}[0]
	add	$IN23_0,$IN01_0,$H0
	add	$IN23_3,$IN01_3,$H3
	add	$IN23_1,$IN01_1,$H1
	add	$IN23_4,$IN01_4,$H4

.Long_tail:
	dup	$IN23_0,${IN23_0}[0]
	umull2	$ACC0,$IN23_2,${S3}
	umull2	$ACC3,$IN23_2,${R1}
	umull2	$ACC4,$IN23_2,${R2}
	umull2	$ACC2,$IN23_2,${R0}
	umull2	$ACC1,$IN23_2,${S4}

	dup	$IN23_1,${IN23_1}[0]
	umlal2	$ACC0,$IN23_0,${R0}
	umlal2	$ACC2,$IN23_0,${R2}
	umlal2	$ACC3,$IN23_0,${R3}
	umlal2	$ACC4,$IN23_0,${R4}
	umlal2	$ACC1,$IN23_0,${R1}

	dup	$IN23_3,${IN23_3}[0]
	umlal2	$ACC0,$IN23_1,${S4}
	umlal2	$ACC3,$IN23_1,${R2}
	umlal2	$ACC2,$IN23_1,${R1}
	umlal2	$ACC4,$IN23_1,${R3}
	umlal2	$ACC1,$IN23_1,${R0}

	dup	$IN23_4,${IN23_4}[0]
	umlal2	$ACC3,$IN23_3,${R0}
	umlal2	$ACC4,$IN23_3,${R1}
	umlal2	$ACC0,$IN23_3,${S2}
	umlal2	$ACC1,$IN23_3,${S3}
	umlal2	$ACC2,$IN23_3,${S4}

	umlal2	$ACC3,$IN23_4,${S4}
	umlal2	$ACC0,$IN23_4,${S1}
	umlal2	$ACC4,$IN23_4,${R0}
	umlal2	$ACC1,$IN23_4,${S2}
	umlal2	$ACC2,$IN23_4,${S3}

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	$IN01_0,$IN01_0,$H0
	umlal	$ACC3,$IN01_2,${R1}
	umlal	$ACC0,$IN01_2,${S3}
	umlal	$ACC4,$IN01_2,${R2}
	umlal	$ACC1,$IN01_2,${S4}
	umlal	$ACC2,$IN01_2,${R0}

	add	$IN01_1,$IN01_1,$H1
	umlal	$ACC3,$IN01_0,${R3}
	umlal	$ACC0,$IN01_0,${R0}
	umlal	$ACC4,$IN01_0,${R4}
	umlal	$ACC1,$IN01_0,${R1}
	umlal	$ACC2,$IN01_0,${R2}

	add	$IN01_3,$IN01_3,$H3
	umlal	$ACC3,$IN01_1,${R2}
	umlal	$ACC0,$IN01_1,${S4}
	umlal	$ACC4,$IN01_1,${R3}
	umlal	$ACC1,$IN01_1,${R0}
	umlal	$ACC2,$IN01_1,${R1}

	add	$IN01_4,$IN01_4,$H4
	umlal	$ACC3,$IN01_3,${R0}
	umlal	$ACC0,$IN01_3,${S2}
	umlal	$ACC4,$IN01_3,${R1}
	umlal	$ACC1,$IN01_3,${S3}
	umlal	$ACC2,$IN01_3,${S4}

	umlal	$ACC3,$IN01_4,${S4}
	umlal	$ACC0,$IN01_4,${S1}
	umlal	$ACC4,$IN01_4,${R0}
	umlal	$ACC1,$IN01_4,${S2}
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2
	ldr	x30,[sp,#8]

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	$T0.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	ushr	$T1.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d

	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
	add	$ACC1,$ACC1,$T1.2d	// h0 -> h1

	ushr	$T0.2d,$ACC4,#26
	and	$ACC4,$ACC4,$MASK.2d
	ushr	$T1.2d,$ACC1,#26
	and	$ACC1,$ACC1,$MASK.2d
	add	$ACC2,$ACC2,$T1.2d	// h1 -> h2

	add	$ACC0,$ACC0,$T0.2d
	shl	$T0.2d,$T0.2d,#2
	ushr	$T1.2d,$ACC2,#26
	and	$ACC2,$ACC2,$MASK.2d
	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
	add	$ACC3,$ACC3,$T1.2d	// h2 -> h3

	ushr	$T0.2d,$ACC0,#26
	and	$ACC0,$ACC0,$MASK.2d
	ushr	$T1.2d,$ACC3,#26
	and	$ACC3,$ACC3,$MASK.2d
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
	mov	x4,#1
	st1	{$ACC4}[0],[$ctx]
	str	x4,[$ctx,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

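# Post-process the generated code before it is printed (and piped through
# arm-xlate.pl when a flavour is given): fix up the vector arrangement
# specifiers for the SIMD instructions emitted above and rewrite the "w#xN"
# placeholders as the corresponding 32-bit register names.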
foreach (split("\n",$code)) {
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	s/\.[124]([sd])\[/.$1\[/;
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}
close STDOUT;