2 # Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
20 # work. While it does improve RSA sign performance by 20-30% (less for
21 # longer keys) on most processors, for some reason RSA2048 is not
22 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23 # instruction issue rate is limited on processor in question, meaning
24 # that dedicated squaring procedure is a must. Well, actually all
25 # contemporary AArch64 processors seem to have limited multiplication
26 # issue rate, i.e. they can't issue multiplication every cycle, which
27 # explains moderate improvement coefficients in comparison to
28 # compiler-generated code. Recall that compiler is instructed to use
29 # umulh and therefore uses same amount of multiplication instructions
30 # to do the job. Assembly's edge is to minimize number of "collateral"
31 # instructions and of course instruction scheduling.
35 # Squaring procedure that handles lengths divisible by 8 improves
36 # RSA/DSA performance by 25-40-60% depending on processor and key
37 # length. Overall improvement coefficients are always positive in
38 # comparison to compiler-generated code. On Cortex-A57 improvement
39 # is still modest on longest key lengths, while others exhibit e.g.
40 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41 # on Cortex-A57 and ~60-100% faster on others.
43 # $output is the last argument if it looks like a file (it has an extension)
44 # $flavour is the first argument if it doesn't look like a file
45 my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46 my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Locate the arm-xlate.pl assembler-translator next to this script, or in
# the shared perlasm directory, and pipe our generated assembly through it.
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
50 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
51 die "can't locate arm-xlate.pl";
# On failure report $! (the OS error from the piped open); the previous
# "$1" was a stale regex capture left over from the $0 match above and
# said nothing about why the pipe could not be opened.
53 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
54 or die "can't call $xlate: $!";
# Scratch-register allocation for the bn_mul_mont code below: eighteen
# register names are generated (x6..x17 and x19..x24) for the seventeen
# Perl variables, so the last generated name is simply unused.  Note that
# x18 is deliberately absent from the map — presumably reserved as the
# platform register per the AArch64 ABI; TODO confirm against arm_arch
# conventions.
57 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
58 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
59 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
# Argument registers, matching the C prototype
#   bn_mul_mont(rp, ap, bp, np, n0, num)
# as spelled out in the trailing comments.
62 $rp="x0";	# BN_ULONG *rp,
63 $ap="x1";	# const BN_ULONG *ap,
64 $bp="x2";	# const BN_ULONG *bp,
65 $np="x3";	# const BN_ULONG *np,
66 $n0="x4";	# const BN_ULONG *n0,
67 $num="x5";	# int num);
71 # include "arm_arch.h"
72 .extern OPENSSL_armv8_rsa_neonized
73 .hidden OPENSSL_armv8_rsa_neonized
78 .type bn_mul_mont,%function
87 adrp x17,OPENSSL_armv8_rsa_neonized
88 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
89 cbnz w17, bn_mul8x_mont_neon
99 stp x29,x30,[sp,#-64]!
105 ldr $m0,[$bp],#8 // bp[0]
106 sub $tp,sp,$num,lsl#3
107 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
110 and $tp,$tp,#-16 // ABI says so
111 ldp $hi1,$nj,[$np],#16 // np[0..1]
113 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
114 sub $j,$num,#16 // j=num-2
116 mul $alo,$aj,$m0 // ap[1]*bp[0]
119 mul $m1,$lo0,$n0 // "tp[0]"*n0
122 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
124 mul $nlo,$nj,$m1 // np[1]*m1
125 // (*) adds $lo1,$lo1,$lo0 // discarded
126 // (*) As for removal of first multiplication and addition
127 // instructions. The outcome of first addition is
128 // guaranteed to be zero, which leaves two computationally
129 // significant outcomes: it either carries or not. Then
130 // question is when does it carry? Is there alternative
131 // way to deduce it? If you follow operations, you can
132 // observe that condition for carry is quite simple:
133 // $lo0 being non-zero. So that carry can be calculated
134 // by adding -1 to $lo0. That's what next instruction does.
135 subs xzr,$lo0,#1 // (*)
148 mul $alo,$aj,$m0 // ap[j]*bp[0]
153 mul $nlo,$nj,$m1 // np[j]*m1
156 str $lo1,[$tp],#8 // tp[j-1]
161 sub $ap,$ap,$num // rewind $ap
165 sub $np,$np,$num // rewind $np
169 sub $i,$num,#8 // i=num-1
172 adc $ovf,xzr,xzr // upmost overflow bit
176 ldr $m0,[$bp],#8 // bp[i]
177 ldp $hi0,$aj,[$ap],#16
178 ldr $tj,[sp] // tp[0]
181 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
182 sub $j,$num,#16 // j=num-2
184 ldp $hi1,$nj,[$np],#16
185 mul $alo,$aj,$m0 // ap[1]*bp[i]
193 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
195 mul $nlo,$nj,$m1 // np[1]*m1
196 // (*) adds $lo1,$lo1,$lo0
197 subs xzr,$lo0,#1 // (*)
204 ldr $tj,[$tp],#8 // tp[j]
213 mul $alo,$aj,$m0 // ap[j]*bp[i]
218 mul $nlo,$nj,$m1 // np[j]*m1
221 stur $lo1,[$tp,#-16] // tp[j-1]
225 ldr $tj,[$tp],#8 // tp[j]
228 sub $ap,$ap,$num // rewind $ap
232 sub $np,$np,$num // rewind $np
241 adc $ovf,$ovf,xzr // upmost overflow bit
242 stp $lo1,$hi1,[$tp,#-16]
246 // Final step. We see if result is larger than modulus, and
247 // if it is, subtract the modulus. But comparison implies
248 // subtraction. So we subtract modulus, see if it borrowed,
249 // and conditionally copy original value.
250 ldr $tj,[sp] // tp[0]
252 ldr $nj,[$np],#8 // np[0]
253 subs $j,$num,#8 // j=num-1 and clear borrow
256 sbcs $aj,$tj,$nj // tp[j]-np[j]
260 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
264 sbcs $ovf,$ovf,xzr // did it borrow?
265 str $aj,[$ap],#8 // rp[num-1]
267 ldr $tj,[sp] // tp[0]
269 ldr $aj,[$rp],#8 // rp[0]
270 sub $num,$num,#8 // num--
273 sub $num,$num,#8 // num--
274 csel $nj,$tj,$aj,lo // did it borrow?
277 stur xzr,[$tp,#-16] // wipe tp
279 cbnz $num,.Lcond_copy
282 stur xzr,[$tp,#-8] // wipe tp
285 ldp x19,x20,[x29,#16]
287 ldp x21,x22,[x29,#32]
289 ldp x23,x24,[x29,#48]
292 .size bn_mul_mont,.-bn_mul_mont
295 my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
296 my ($Z,$Temp)=("v4.16b","v5");
297 my @ACC=map("v$_",(6..13));
298 my ($Bi,$Ni,$M0)=map("v$_",(28..30));
305 my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
306 my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
309 .type bn_mul8x_mont_neon,%function
312 stp x29,x30,[sp,#-80]!
319 eor $zero.16b,$zero.16b,$zero.16b
323 eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
325 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
326 sub $toutptr,$toutptr,$num,lsl#4
327 eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
328 and $toutptr,$toutptr,#-64
329 eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
330 mov sp,$toutptr // alloca
331 eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
332 add $toutptr,$toutptr,#256
333 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
335 eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
336 eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
339 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
340 subs $inner,$inner,#8
341 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
342 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
343 st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
347 ld1 {$A0.4s,$A1.4s},[$aptr],#32
355 ldr $sBi,[$bptr],#4 // *b++
358 ld1 {$N0.4s,$N1.4s},[$nptr],#32
360 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
361 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
362 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
363 shl $Ni.2d,@ACC[0].2d,#16
364 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
365 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
366 add $Ni.2d,$Ni.2d,@ACC[0].2d
367 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
368 mul $Ni.2s,$Ni.2s,$M0.2s
369 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
370 st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
371 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
373 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
377 ldr $sBi,[$bptr],#4 // *b++
378 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
379 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
381 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
382 ushr $temp.2d,@ACC[0].2d,#16
383 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
384 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
385 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
386 add @ACC[0].2d,@ACC[0].2d,$temp.2d
387 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
388 ushr @ACC[0].2d,@ACC[0].2d,#16
389 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
390 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
391 add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
392 ins @ACC[1].d[0],$ACCTemp.d[0]
393 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
395 push(@ACC,shift(@ACC)); $i++;
397 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
398 ld1 {@ACC[7].2d},[$tinptr],#16
399 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
400 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
401 shl $Ni.2d,@ACC[0].2d,#16
402 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
403 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
404 add $Ni.2d,$Ni.2d,@ACC[0].2d
405 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
406 mul $Ni.2s,$Ni.2s,$M0.2s
407 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
408 st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
409 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
411 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
415 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
416 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
417 ld1 {$A0.4s,$A1.4s},[$aptr],#32
418 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
419 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
420 mov $Temp.16b,@ACC[0].16b
421 ushr $Temp.2d,$Temp.2d,#16
422 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
423 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
424 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
425 add @ACC[0].2d,@ACC[0].2d,$Temp.2d
426 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
427 ushr @ACC[0].2d,@ACC[0].2d,#16
428 eor $temp.16b,$temp.16b,$temp.16b
429 ins @ACC[0].d[1],$temp.d[0]
430 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
431 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
432 add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
433 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
434 add $bnptr,sp,#8 // rewind
436 push(@ACC,shift(@ACC));
443 subs $inner,$inner,#8
444 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
445 ld1 {@ACC[7].2d},[$tinptr]
446 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
447 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
448 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
449 ld1 {$N0.4s,$N1.4s},[$nptr],#32
450 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
452 add $tinptr,$tinptr,#16 // don't advance in last iteration
454 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
455 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
456 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
457 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
459 for ($i=1; $i<8; $i++) {
461 ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
462 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
463 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
464 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
465 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
466 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
467 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
468 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
469 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
470 st1 {@ACC[0].2d},[$toutptr],#16
472 push(@ACC,shift(@ACC));
474 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
475 ld1 {@ACC[7].2d},[$tinptr]
476 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
477 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
478 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
480 add $tinptr,$tinptr,#16 // don't advance in last iteration
482 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
483 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
484 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
485 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
486 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
490 b.ne .LInner_after_rewind$i
491 sub $aptr,$aptr,$num,lsl#2 // rewind
492 .LInner_after_rewind$i:
493 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
494 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
495 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
496 ld1 {$A0.4s,$A1.4s},[$aptr],#32
497 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
498 add $bnptr,sp,#8 // rewind
499 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
500 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
501 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
502 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
503 st1 {@ACC[0].2d},[$toutptr],#16
504 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
508 push(@ACC,shift(@ACC));
511 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
512 eor $N0.16b,$N0.16b,$N0.16b // $N0
513 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
514 eor $N1.16b,$N1.16b,$N1.16b // $N1
515 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
516 st1 {@ACC[6].2d},[$toutptr]
518 subs $outer,$outer,#8
519 ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
520 ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
521 ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
522 ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
524 b.eq .LInner_8n_jump_2steps
525 sub $nptr,$nptr,$num,lsl#2 // rewind
528 .LInner_8n_jump_2steps:
530 st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
531 mov $Temp.16b,@ACC[0].16b
532 ushr $temp.2d,@ACC[0].2d,#16
533 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
534 st1 {$N0.2d,$N1.2d}, [sp],#32
535 add @ACC[0].2d,@ACC[0].2d,$temp.2d
536 st1 {$N0.2d,$N1.2d}, [sp],#32
537 ushr $temp.2d,@ACC[0].2d,#16
538 st1 {$N0.2d,$N1.2d}, [sp],#32
539 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
540 ins $temp.d[1],$zero.d[0]
547 add @ACC[0].2d,@ACC[0].2d,$temp.2d
548 mov $Temp.16b,@ACC[0].16b
549 ushr $temp.2d,@ACC[0].2d,#16
550 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
551 ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
552 add @ACC[0].2d,@ACC[0].2d,$temp.2d
553 ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
554 ushr $temp.2d,@ACC[0].2d,#16
555 ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
556 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
557 ins $temp.d[1],$zero.d[0]
561 for ($i=1; $i<8; $i++) {
563 add @ACC[1].2d,@ACC[1].2d,$temp.2d
564 st1 {@ACC[0].s}[0], [$toutptr],#4
565 ushr $temp.2d,@ACC[1].2d,#16
566 mov $Temp.16b,@ACC[1].16b
567 ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
568 add @ACC[1].2d,@ACC[1].2d,$temp.2d
569 ushr $temp.2d,@ACC[1].2d,#16
570 zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
571 ins $temp.d[1],$zero.d[0]
573 push(@ACC,shift(@ACC));
575 push(@ACC,shift(@ACC));
577 ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
578 subs $inner,$inner,#8
579 st1 {@ACC[7].s}[0], [$toutptr],#4
582 st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
583 sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
584 subs $aptr,sp,#0 // clear carry flag
585 add $bptr,sp,$num,lsl#2
591 ldp w10,w11,[$nptr],#8
598 stp w10,w11,[$rptr],#8
601 ldr w10, [$aptr] // load top-most bit
603 eor v0.16b,v0.16b,v0.16b
604 sub x11,$bptr,x11 // this is num*4
605 eor v1.16b,v1.16b,v1.16b
607 sub $rptr,$rptr,x11 // rewind $rptr
608 mov $nptr,$bptr // second 3/4th of frame
609 sbcs w10,w10,wzr // result is carry flag
623 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
624 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
628 stp w10,w11,[$rptr],#8
639 st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
640 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
641 sub x17,$bptr,$aptr // preserves carry
643 stp w10,w11,[$rptr],#8
644 cbnz x17,.LNEON_copy_n_zap
654 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
658 ########################################################################
659 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
661 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
662 my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
663 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
664 my ($cnt,$carry,$topmost)=("x27","x28","x30");
665 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
668 .type __bn_sqr8x_mont,%function
674 .inst 0xd503233f // paciasp
675 stp x29,x30,[sp,#-128]!
682 stp $rp,$np,[sp,#96] // offload rp and np
684 ldp $a0,$a1,[$ap,#8*0]
685 ldp $a2,$a3,[$ap,#8*2]
686 ldp $a4,$a5,[$ap,#8*4]
687 ldp $a6,$a7,[$ap,#8*6]
689 sub $tp,sp,$num,lsl#4
698 stp xzr,xzr,[$tp,#8*0]
699 stp xzr,xzr,[$tp,#8*2]
700 stp xzr,xzr,[$tp,#8*4]
701 stp xzr,xzr,[$tp,#8*6]
703 stp xzr,xzr,[$tp,#8*8]
704 stp xzr,xzr,[$tp,#8*10]
705 stp xzr,xzr,[$tp,#8*12]
706 stp xzr,xzr,[$tp,#8*14]
708 cbnz $cnt,.Lsqr8x_zero
721 str $n0,[x29,#112] // offload n0
723 // Multiply everything but a[i]*a[i]
755 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
759 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
766 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
773 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
774 adc $acc0,xzr,xzr // t[8]
775 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
782 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
795 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
802 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
803 adc $acc1,xzr,xzr // t[9]
809 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
820 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
827 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
828 adc $acc2,xzr,xzr // t[10]
832 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
841 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
848 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
849 adc $acc3,xzr,xzr // t[11]
851 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
858 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
864 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
865 adc $acc4,xzr,xzr // t[12]
869 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
874 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
876 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
877 adc $acc5,xzr,xzr // t[13]
879 sub $cnt,$ap_end,$ap // done yet?
883 sub $t0,$ap_end,$num // rewinded ap
884 adc $acc6,xzr,xzr // t[14]
887 cbz $cnt,.Lsqr8x_outer_break
890 ldp $a0,$a1,[$tp,#8*0]
891 ldp $a2,$a3,[$tp,#8*2]
892 ldp $a4,$a5,[$tp,#8*4]
893 ldp $a6,$a7,[$tp,#8*6]
896 ldp $a0,$a1,[$ap,#8*0]
899 ldp $a2,$a3,[$ap,#8*2]
902 ldp $a4,$a5,[$ap,#8*4]
906 ldp $a6,$a7,[$ap,#8*6]
908 //adc $carry,xzr,xzr // moved below
920 // a[f]a[1]........................
922 // a[f]a[2]........................
924 // a[f]a[3]........................
926 // a[f]a[4]........................
928 // a[f]a[5]........................
930 // a[f]a[6]........................
932 // a[f]a[7]........................
935 adc $carry,xzr,xzr // carry bit, modulo-scheduled
956 adc $carry,$carry,xzr
970 adcs $acc7,$carry,$t3
971 //adc $carry,xzr,xzr // moved above
972 cbnz $cnt,.Lsqr8x_mul
973 // note that carry flag is guaranteed
974 // to be zero at this point
975 cmp $ap,$ap_end // done yet?
978 ldp $a0,$a1,[$tp,#8*0]
979 ldp $a2,$a3,[$tp,#8*2]
980 ldp $a4,$a5,[$tp,#8*4]
981 ldp $a6,$a7,[$tp,#8*6]
985 ldp $a0,$a1,[$ap,#8*0]
988 ldp $a2,$a3,[$ap,#8*2]
991 ldp $a4,$a5,[$ap,#8*4]
995 ldp $a6,$a7,[$ap,#8*6]
997 //adc $carry,xzr,xzr // moved above
1002 ldp $a0,$a1,[$rp,#8*0]
1004 ldp $a2,$a3,[$rp,#8*2]
1005 sub $t0,$ap_end,$ap // is it last iteration?
1006 ldp $a4,$a5,[$rp,#8*4]
1008 ldp $a6,$a7,[$rp,#8*6]
1009 cbz $t0,.Lsqr8x_outer_loop
1011 stp $acc0,$acc1,[$tp,#8*0]
1012 ldp $acc0,$acc1,[$t1,#8*0]
1013 stp $acc2,$acc3,[$tp,#8*2]
1014 ldp $acc2,$acc3,[$t1,#8*2]
1015 stp $acc4,$acc5,[$tp,#8*4]
1016 ldp $acc4,$acc5,[$t1,#8*4]
1017 stp $acc6,$acc7,[$tp,#8*6]
1019 ldp $acc6,$acc7,[$t1,#8*6]
1020 b .Lsqr8x_outer_loop
1023 .Lsqr8x_outer_break:
1024 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1025 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
1026 ldp $t1,$t2,[sp,#8*1]
1027 ldp $a5,$a7,[$t0,#8*2]
1029 ldp $t3,$t0,[sp,#8*3]
1031 stp $acc0,$acc1,[$tp,#8*0]
1033 stp $acc2,$acc3,[$tp,#8*2]
1035 stp $acc4,$acc5,[$tp,#8*4]
1037 stp $acc6,$acc7,[$tp,#8*6]
1040 adds $acc1,$a1,$t1,lsl#1
1041 extr $t1,$t2,$t1,#63
1044 .Lsqr4x_shift_n_add:
1046 extr $t2,$t3,$t2,#63
1049 ldp $t1,$t2,[$tp,#8*5]
1051 ldp $a1,$a3,[$ap],#8*2
1055 extr $t3,$t0,$t3,#63
1056 stp $acc0,$acc1,[$tp,#8*0]
1058 extr $t0,$t1,$t0,#63
1059 stp $acc2,$acc3,[$tp,#8*2]
1061 ldp $t3,$t0,[$tp,#8*7]
1062 extr $t1,$t2,$t1,#63
1064 extr $t2,$t3,$t2,#63
1066 ldp $t1,$t2,[$tp,#8*9]
1068 ldp $a5,$a7,[$ap],#8*2
1072 stp $acc4,$acc5,[$tp,#8*4]
1073 extr $t3,$t0,$t3,#63
1074 stp $acc6,$acc7,[$tp,#8*6]
1077 extr $t0,$t1,$t0,#63
1079 ldp $t3,$t0,[$tp,#8*3]
1080 extr $t1,$t2,$t1,#63
1081 cbnz $cnt,.Lsqr4x_shift_n_add
1083 my ($np,$np_end)=($ap,$ap_end);
1085 ldp $np,$n0,[x29,#104] // pull np and n0
1088 extr $t2,$t3,$t2,#63
1090 ldp $t1,$t2,[$tp,#8*5]
1093 stp $acc0,$acc1,[$tp,#8*0]
1096 stp $acc2,$acc3,[$tp,#8*2]
1097 extr $t3,$t0,$t3,#63
1099 extr $t0,$t1,$t0,#63
1100 ldp $acc0,$acc1,[sp,#8*0]
1102 extr $t1,$t2,$t1,#63
1103 ldp $a0,$a1,[$np,#8*0]
1105 extr $t2,xzr,$t2,#63
1106 ldp $a2,$a3,[$np,#8*2]
1108 ldp $a4,$a5,[$np,#8*4]
1110 // Reduce by 512 bits per iteration
1111 mul $na0,$n0,$acc0 // t[0]*n0
1112 ldp $a6,$a7,[$np,#8*6]
1113 add $np_end,$np,$num
1114 ldp $acc2,$acc3,[sp,#8*2]
1115 stp $acc4,$acc5,[$tp,#8*4]
1116 ldp $acc4,$acc5,[sp,#8*4]
1117 stp $acc6,$acc7,[$tp,#8*6]
1118 ldp $acc6,$acc7,[sp,#8*6]
1120 mov $topmost,xzr // initial top-most carry
1125 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
1129 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
1131 // (*) adds xzr,$acc0,$t0
1132 subs xzr,$acc0,#1 // (*)
1134 adcs $acc0,$acc1,$t1
1136 adcs $acc1,$acc2,$t2
1138 adcs $acc2,$acc3,$t3
1140 adcs $acc3,$acc4,$t0
1141 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
1142 adcs $acc4,$acc5,$t1
1144 adcs $acc5,$acc6,$t2
1146 adcs $acc6,$acc7,$t3
1149 adds $acc0,$acc0,$t0
1151 adcs $acc1,$acc1,$t1
1153 adcs $acc2,$acc2,$t2
1155 adcs $acc3,$acc3,$t3
1157 mul $na0,$n0,$acc0 // next t[0]*n0
1158 adcs $acc4,$acc4,$t0
1159 adcs $acc5,$acc5,$t1
1160 adcs $acc6,$acc6,$t2
1162 cbnz $cnt,.Lsqr8x_reduction
1164 ldp $t0,$t1,[$tp,#8*0]
1165 ldp $t2,$t3,[$tp,#8*2]
1167 sub $cnt,$np_end,$np // done yet?
1168 adds $acc0,$acc0,$t0
1169 adcs $acc1,$acc1,$t1
1170 ldp $t0,$t1,[$tp,#8*4]
1171 adcs $acc2,$acc2,$t2
1172 adcs $acc3,$acc3,$t3
1173 ldp $t2,$t3,[$tp,#8*6]
1174 adcs $acc4,$acc4,$t0
1175 adcs $acc5,$acc5,$t1
1176 adcs $acc6,$acc6,$t2
1177 adcs $acc7,$acc7,$t3
1178 //adc $carry,xzr,xzr // moved below
1179 cbz $cnt,.Lsqr8x8_post_condition
1181 ldur $n0,[$tp,#-8*8]
1182 ldp $a0,$a1,[$np,#8*0]
1183 ldp $a2,$a3,[$np,#8*2]
1184 ldp $a4,$a5,[$np,#8*4]
1186 ldp $a6,$a7,[$np,#8*6]
1191 adc $carry,xzr,xzr // carry bit, modulo-scheduled
1196 adds $acc0,$acc0,$t0
1198 adcs $acc1,$acc1,$t1
1200 adcs $acc2,$acc2,$t2
1202 adcs $acc3,$acc3,$t3
1204 adcs $acc4,$acc4,$t0
1206 adcs $acc5,$acc5,$t1
1208 adcs $acc6,$acc6,$t2
1210 adcs $acc7,$acc7,$t3
1212 adc $carry,$carry,xzr
1214 adds $acc0,$acc1,$t0
1216 adcs $acc1,$acc2,$t1
1218 adcs $acc2,$acc3,$t2
1220 adcs $acc3,$acc4,$t3
1223 adcs $acc4,$acc5,$t0
1224 adcs $acc5,$acc6,$t1
1225 adcs $acc6,$acc7,$t2
1226 adcs $acc7,$carry,$t3
1227 //adc $carry,xzr,xzr // moved above
1228 cbnz $cnt,.Lsqr8x_tail
1229 // note that carry flag is guaranteed
1230 // to be zero at this point
1231 ldp $a0,$a1,[$tp,#8*0]
1232 sub $cnt,$np_end,$np // done yet?
1233 sub $t2,$np_end,$num // rewinded np
1234 ldp $a2,$a3,[$tp,#8*2]
1235 ldp $a4,$a5,[$tp,#8*4]
1236 ldp $a6,$a7,[$tp,#8*6]
1237 cbz $cnt,.Lsqr8x_tail_break
1239 ldur $n0,[$rp,#-8*8]
1240 adds $acc0,$acc0,$a0
1241 adcs $acc1,$acc1,$a1
1242 ldp $a0,$a1,[$np,#8*0]
1243 adcs $acc2,$acc2,$a2
1244 adcs $acc3,$acc3,$a3
1245 ldp $a2,$a3,[$np,#8*2]
1246 adcs $acc4,$acc4,$a4
1247 adcs $acc5,$acc5,$a5
1248 ldp $a4,$a5,[$np,#8*4]
1249 adcs $acc6,$acc6,$a6
1251 adcs $acc7,$acc7,$a7
1252 ldp $a6,$a7,[$np,#8*6]
1254 //adc $carry,xzr,xzr // moved above
1259 ldr $n0,[x29,#112] // pull n0
1260 add $cnt,$tp,#8*8 // end of current t[num] window
1262 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
1265 ldp $acc0,$acc1,[$rp,#8*0]
1266 adcs $acc2,$acc2,$a2
1267 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
1268 adcs $acc3,$acc3,$a3
1269 ldp $a2,$a3,[$t2,#8*2]
1270 adcs $acc4,$acc4,$a4
1271 adcs $acc5,$acc5,$a5
1272 ldp $a4,$a5,[$t2,#8*4]
1273 adcs $acc6,$acc6,$a6
1274 adcs $acc7,$acc7,$a7
1275 ldp $a6,$a7,[$t2,#8*6]
1277 adc $topmost,xzr,xzr // top-most carry
1279 stp $t0,$t1,[$tp,#8*0]
1280 stp $acc2,$acc3,[$tp,#8*2]
1281 ldp $acc2,$acc3,[$rp,#8*2]
1282 stp $acc4,$acc5,[$tp,#8*4]
1283 ldp $acc4,$acc5,[$rp,#8*4]
1284 cmp $cnt,x29 // did we hit the bottom?
1285 stp $acc6,$acc7,[$tp,#8*6]
1286 mov $tp,$rp // slide the window
1287 ldp $acc6,$acc7,[$rp,#8*6]
1289 b.ne .Lsqr8x_reduction
1291 // Final step. We see if result is larger than modulus, and
1292 // if it is, subtract the modulus. But comparison implies
1293 // subtraction. So we subtract modulus, see if it borrowed,
1294 // and conditionally copy original value.
1295 ldr $rp,[x29,#96] // pull rp
1300 mov $ap_end,$rp // $rp copy
1304 ldp $a0,$a1,[$np,#8*0]
1306 stp $t0,$t1,[$rp,#8*0]
1308 ldp $a2,$a3,[$np,#8*2]
1310 stp $t2,$t3,[$rp,#8*2]
1312 ldp $a4,$a5,[$np,#8*4]
1314 ldp $a6,$a7,[$np,#8*6]
1316 ldp $acc0,$acc1,[$tp,#8*0]
1318 ldp $acc2,$acc3,[$tp,#8*2]
1319 ldp $acc4,$acc5,[$tp,#8*4]
1320 ldp $acc6,$acc7,[$tp,#8*6]
1322 stp $t0,$t1,[$rp,#8*4]
1324 stp $t2,$t3,[$rp,#8*6]
1327 cbnz $cnt,.Lsqr8x_sub
1332 ldp $a0,$a1,[$ap_end,#8*0]
1334 stp $t0,$t1,[$rp,#8*0]
1336 ldp $a2,$a3,[$ap_end,#8*2]
1338 stp $t2,$t3,[$rp,#8*2]
1340 ldp $acc0,$acc1,[$ap,#8*0]
1342 ldp $acc2,$acc3,[$ap,#8*2]
1343 sbcs xzr,$topmost,xzr // did it borrow?
1344 ldr x30,[x29,#8] // pull return address
1345 stp $t0,$t1,[$rp,#8*4]
1346 stp $t2,$t3,[$rp,#8*6]
1351 csel $t0,$acc0,$a0,lo
1352 stp xzr,xzr,[$tp,#8*0]
1353 csel $t1,$acc1,$a1,lo
1354 ldp $a0,$a1,[$ap_end,#8*4]
1355 ldp $acc0,$acc1,[$ap,#8*4]
1356 csel $t2,$acc2,$a2,lo
1357 stp xzr,xzr,[$tp,#8*2]
1359 csel $t3,$acc3,$a3,lo
1360 ldp $a2,$a3,[$ap_end,#8*6]
1361 ldp $acc2,$acc3,[$ap,#8*6]
1363 stp $t0,$t1,[$ap_end,#8*0]
1364 stp $t2,$t3,[$ap_end,#8*2]
1365 add $ap_end,$ap_end,#8*4
1366 stp xzr,xzr,[$ap,#8*0]
1367 stp xzr,xzr,[$ap,#8*2]
1368 cbnz $cnt,.Lsqr4x_cond_copy
1370 csel $t0,$acc0,$a0,lo
1371 stp xzr,xzr,[$tp,#8*0]
1372 csel $t1,$acc1,$a1,lo
1373 stp xzr,xzr,[$tp,#8*2]
1374 csel $t2,$acc2,$a2,lo
1375 csel $t3,$acc3,$a3,lo
1376 stp $t0,$t1,[$ap_end,#8*0]
1377 stp $t2,$t3,[$ap_end,#8*2]
1382 .Lsqr8x8_post_condition:
1384 ldr x30,[x29,#8] // pull return address
1385 // $acc0-7,$carry hold result, $a0-7 hold modulus
1387 ldr $ap,[x29,#96] // pull rp
1389 stp xzr,xzr,[sp,#8*0]
1391 stp xzr,xzr,[sp,#8*2]
1393 stp xzr,xzr,[sp,#8*4]
1395 stp xzr,xzr,[sp,#8*6]
1397 stp xzr,xzr,[sp,#8*8]
1399 stp xzr,xzr,[sp,#8*10]
1401 stp xzr,xzr,[sp,#8*12]
1402 sbcs $carry,$carry,xzr // did it borrow?
1403 stp xzr,xzr,[sp,#8*14]
1405 // $a0-7 hold result-modulus
1406 csel $a0,$acc0,$a0,lo
1407 csel $a1,$acc1,$a1,lo
1408 csel $a2,$acc2,$a2,lo
1409 csel $a3,$acc3,$a3,lo
1410 stp $a0,$a1,[$ap,#8*0]
1411 csel $a4,$acc4,$a4,lo
1412 csel $a5,$acc5,$a5,lo
1413 stp $a2,$a3,[$ap,#8*2]
1414 csel $a6,$acc6,$a6,lo
1415 csel $a7,$acc7,$a7,lo
1416 stp $a4,$a5,[$ap,#8*4]
1417 stp $a6,$a7,[$ap,#8*6]
1420 ldp x19,x20,[x29,#16]
1422 ldp x21,x22,[x29,#32]
1424 ldp x23,x24,[x29,#48]
1425 ldp x25,x26,[x29,#64]
1426 ldp x27,x28,[x29,#80]
1428 .inst 0xd50323bf // autiasp
1430 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1435 ########################################################################
1436 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1437 # x86_64-mont5 module, it's different in sense that it performs
1438 # reduction 256 bits at a time.
1440 my ($a0,$a1,$a2,$a3,
1443 $acc0,$acc1,$acc2,$acc3,$acc4,
1444 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1446 my ($carry,$topmost) = ($rp,"x30");
1449 .type __bn_mul4x_mont,%function
1452 .inst 0xd503233f // paciasp
1453 stp x29,x30,[sp,#-128]!
1455 stp x19,x20,[sp,#16]
1456 stp x21,x22,[sp,#32]
1457 stp x23,x24,[sp,#48]
1458 stp x25,x26,[sp,#64]
1459 stp x27,x28,[sp,#80]
1461 sub $tp,sp,$num,lsl#3
1463 ldr $n0,[$n0] // *n0
1464 sub sp,$tp,#8*4 // alloca
1467 add $ap_end,$ap,$num
1468 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1470 ldr $bi,[$bp,#8*0] // b[0]
1471 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1472 ldp $a2,$a3,[$ap,#8*2]
1478 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1479 ldp $m2,$m3,[$np,#8*2]
1480 adds $np,$np,#8*4 // clear carry bit
1485 .Loop_mul4x_1st_reduction:
1486 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1487 adc $carry,$carry,xzr // modulo-scheduled
1493 adds $acc0,$acc0,$t0
1494 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1495 adcs $acc1,$acc1,$t1
1496 mul $mi,$acc0,$n0 // t[0]*n0
1497 adcs $acc2,$acc2,$t2
1499 adcs $acc3,$acc3,$t3
1503 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1504 adds $acc1,$acc1,$t0
1505 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1506 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1507 adcs $acc2,$acc2,$t1
1509 adcs $acc3,$acc3,$t2
1511 adc $acc4,$acc4,$t3 // can't overflow
1513 // (*) adds xzr,$acc0,$t0
1514 subs xzr,$acc0,#1 // (*)
1515 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1516 adcs $acc0,$acc1,$t1
1518 adcs $acc1,$acc2,$t2
1520 adcs $acc2,$acc3,$t3
1522 adcs $acc3,$acc4,$carry
1524 adds $acc0,$acc0,$t0
1526 adcs $acc1,$acc1,$t1
1527 adcs $acc2,$acc2,$t2
1528 adcs $acc3,$acc3,$t3
1529 //adc $carry,$carry,xzr
1530 cbnz $cnt,.Loop_mul4x_1st_reduction
1532 cbz $t0,.Lmul4x4_post_condition
1534 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1535 ldp $a2,$a3,[$ap,#8*2]
1537 ldr $mi,[sp] // a[0]*n0
1538 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1539 ldp $m2,$m3,[$np,#8*2]
1542 .Loop_mul4x_1st_tail:
1543 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1544 adc $carry,$carry,xzr // modulo-scheduled
1550 adds $acc0,$acc0,$t0
1551 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1552 adcs $acc1,$acc1,$t1
1554 adcs $acc2,$acc2,$t2
1556 adcs $acc3,$acc3,$t3
1559 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1560 adds $acc1,$acc1,$t0
1561 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1562 adcs $acc2,$acc2,$t1
1564 adcs $acc3,$acc3,$t2
1566 adc $acc4,$acc4,$t3 // can't overflow
1568 adds $acc0,$acc0,$t0
1569 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1570 adcs $acc1,$acc1,$t1
1572 adcs $acc2,$acc2,$t2
1574 adcs $acc3,$acc3,$t3
1575 adcs $acc4,$acc4,$carry
1578 ldr $mi,[sp,$cnt] // next t[0]*n0
1579 str $acc0,[$tp],#8 // result!!!
1580 adds $acc0,$acc1,$t0
1581 sub $t0,$ap_end,$ap // done yet?
1582 adcs $acc1,$acc2,$t1
1583 adcs $acc2,$acc3,$t2
1584 adcs $acc3,$acc4,$t3
1585 //adc $carry,$carry,xzr
1586 cbnz $cnt,.Loop_mul4x_1st_tail
1588 sub $t1,$ap_end,$num // rewinded $ap
1589 cbz $t0,.Lmul4x_proceed
1591 ldp $a0,$a1,[$ap,#8*0]
1592 ldp $a2,$a3,[$ap,#8*2]
1594 ldp $m0,$m1,[$np,#8*0]
1595 ldp $m2,$m3,[$np,#8*2]
1597 b .Loop_mul4x_1st_tail
1601 ldr $bi,[$bp,#8*4]! // *++b
1602 adc $topmost,$carry,xzr
1603 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1604 sub $np,$np,$num // rewind np
1605 ldp $a2,$a3,[$t1,#8*2]
1608 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1609 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1610 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1611 ldp $acc2,$acc3,[sp,#8*6]
1613 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1615 ldp $m2,$m3,[$np,#8*2]
1616 adds $np,$np,#8*4 // clear carry bit
1620 .Loop_mul4x_reduction:
1621 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1622 adc $carry,$carry,xzr // modulo-scheduled
1628 adds $acc0,$acc0,$t0
1629 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1630 adcs $acc1,$acc1,$t1
1631 mul $mi,$acc0,$n0 // t[0]*n0
1632 adcs $acc2,$acc2,$t2
1634 adcs $acc3,$acc3,$t3
1638 ldr $bi,[$bp,$cnt] // next b[i]
1639 adds $acc1,$acc1,$t0
1640 // (*) mul $t0,$m0,$mi
1641 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1642 adcs $acc2,$acc2,$t1
1643 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
1644 adcs $acc3,$acc3,$t2
1646 adc $acc4,$acc4,$t3 // can't overflow
1648 // (*) adds xzr,$acc0,$t0
1649 subs xzr,$acc0,#1 // (*)
1650 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
1651 adcs $acc0,$acc1,$t1
1653 adcs $acc1,$acc2,$t2
1655 adcs $acc2,$acc3,$t3
1657 adcs $acc3,$acc4,$carry
1659 adds $acc0,$acc0,$t0
1660 adcs $acc1,$acc1,$t1
1661 adcs $acc2,$acc2,$t2
1662 adcs $acc3,$acc3,$t3
1663 //adc $carry,$carry,xzr
1664 cbnz $cnt,.Loop_mul4x_reduction
1666 adc $carry,$carry,xzr
1667 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1668 ldp $t2,$t3,[$tp,#8*6]
1669 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1670 ldp $a2,$a3,[$ap,#8*2]
1672 adds $acc0,$acc0,$t0
1673 adcs $acc1,$acc1,$t1
1674 adcs $acc2,$acc2,$t2
1675 adcs $acc3,$acc3,$t3
1676 //adc $carry,$carry,xzr
1678 ldr $mi,[sp] // t[0]*n0
1679 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1680 ldp $m2,$m3,[$np,#8*2]
1685 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1686 adc $carry,$carry,xzr // modulo-scheduled
1692 adds $acc0,$acc0,$t0
1693 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1694 adcs $acc1,$acc1,$t1
1696 adcs $acc2,$acc2,$t2
1698 adcs $acc3,$acc3,$t3
1701 ldr $bi,[$bp,$cnt] // next b[i]
1702 adds $acc1,$acc1,$t0
1703 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1704 adcs $acc2,$acc2,$t1
1706 adcs $acc3,$acc3,$t2
1708 adc $acc4,$acc4,$t3 // can't overflow
1710 adds $acc0,$acc0,$t0
1711 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1712 adcs $acc1,$acc1,$t1
1714 adcs $acc2,$acc2,$t2
1716 adcs $acc3,$acc3,$t3
1718 adcs $acc4,$acc4,$carry
1719 ldr $mi,[sp,$cnt] // next a[0]*n0
1721 str $acc0,[$tp],#8 // result!!!
1722 adds $acc0,$acc1,$t0
1723 sub $t0,$ap_end,$ap // done yet?
1724 adcs $acc1,$acc2,$t1
1725 adcs $acc2,$acc3,$t2
1726 adcs $acc3,$acc4,$t3
1727 //adc $carry,$carry,xzr
1728 cbnz $cnt,.Loop_mul4x_tail
1730 sub $t1,$np,$num // rewinded np?
1731 adc $carry,$carry,xzr
1732 cbz $t0,.Loop_mul4x_break
1734 ldp $t0,$t1,[$tp,#8*4]
1735 ldp $t2,$t3,[$tp,#8*6]
1736 ldp $a0,$a1,[$ap,#8*0]
1737 ldp $a2,$a3,[$ap,#8*2]
1739 adds $acc0,$acc0,$t0
1740 adcs $acc1,$acc1,$t1
1741 adcs $acc2,$acc2,$t2
1742 adcs $acc3,$acc3,$t3
1743 //adc $carry,$carry,xzr
1744 ldp $m0,$m1,[$np,#8*0]
1745 ldp $m2,$m3,[$np,#8*2]
1751 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1752 adds $acc0,$acc0,$topmost
1753 add $bp,$bp,#8*4 // bp++
1754 adcs $acc1,$acc1,xzr
1755 sub $ap,$ap,$num // rewind ap
1756 adcs $acc2,$acc2,xzr
1757 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1758 adcs $acc3,$acc3,xzr
1759 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1760 adc $topmost,$carry,xzr
1761 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1762 cmp $bp,$t3 // done yet?
1763 ldp $acc2,$acc3,[sp,#8*6]
1764 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1765 ldp $m2,$m3,[$t1,#8*2]
1770 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1771 ldp $a2,$a3,[$ap,#8*2]
1772 adds $ap,$ap,#8*4 // clear carry bit
1775 b .Loop_mul4x_reduction
1779 // Final step. We see if result is larger than modulus, and
1780 // if it is, subtract the modulus. But comparison implies
1781 // subtraction. So we subtract modulus, see if it borrowed,
1782 // and conditionally copy original value.
1784 mov $ap_end,$t2 // $rp copy
1792 ldp $m0,$m1,[$np,#8*0]
1794 ldp $acc0,$acc1,[$tp,#8*0]
1796 ldp $m2,$m3,[$np,#8*2]
1798 ldp $acc2,$acc3,[$tp,#8*2]
1800 stp $t0,$t1,[$rp,#8*0]
1802 stp $t2,$t3,[$rp,#8*2]
1805 cbnz $cnt,.Lmul4x_sub
1810 ldp $a0,$a1,[$ap_end,#8*0]
1812 stp $t0,$t1,[$rp,#8*0]
1813 ldp $a2,$a3,[$ap_end,#8*2]
1814 stp $t2,$t3,[$rp,#8*2]
1815 ldp $acc0,$acc1,[$ap,#8*0]
1816 ldp $acc2,$acc3,[$ap,#8*2]
1817 sbcs xzr,$topmost,xzr // did it borrow?
1818 ldr x30,[x29,#8] // pull return address
1823 csel $t0,$acc0,$a0,lo
1824 stp xzr,xzr,[$tp,#8*0]
1825 csel $t1,$acc1,$a1,lo
1826 ldp $a0,$a1,[$ap_end,#8*4]
1827 ldp $acc0,$acc1,[$ap,#8*4]
1828 csel $t2,$acc2,$a2,lo
1829 stp xzr,xzr,[$tp,#8*2]
1831 csel $t3,$acc3,$a3,lo
1832 ldp $a2,$a3,[$ap_end,#8*6]
1833 ldp $acc2,$acc3,[$ap,#8*6]
1835 stp $t0,$t1,[$ap_end,#8*0]
1836 stp $t2,$t3,[$ap_end,#8*2]
1837 add $ap_end,$ap_end,#8*4
1838 cbnz $cnt,.Lmul4x_cond_copy
1840 csel $t0,$acc0,$a0,lo
1841 stp xzr,xzr,[$tp,#8*0]
1842 csel $t1,$acc1,$a1,lo
1843 stp xzr,xzr,[$tp,#8*2]
1844 csel $t2,$acc2,$a2,lo
1845 stp xzr,xzr,[$tp,#8*3]
1846 csel $t3,$acc3,$a3,lo
1847 stp xzr,xzr,[$tp,#8*4]
1848 stp $t0,$t1,[$ap_end,#8*0]
1849 stp $t2,$t3,[$ap_end,#8*2]
1854 .Lmul4x4_post_condition:
1855 adc $carry,$carry,xzr
1856 ldr $ap,[x29,#96] // pull rp
1857 // $acc0-3,$carry hold result, $m0-7 hold modulus
1859 ldr x30,[x29,#8] // pull return address
1861 stp xzr,xzr,[sp,#8*0]
1863 stp xzr,xzr,[sp,#8*2]
1865 stp xzr,xzr,[sp,#8*4]
1866 sbcs xzr,$carry,xzr // did it borrow?
1867 stp xzr,xzr,[sp,#8*6]
1869 // $a0-3 hold result-modulus
1870 csel $a0,$acc0,$a0,lo
1871 csel $a1,$acc1,$a1,lo
1872 csel $a2,$acc2,$a2,lo
1873 csel $a3,$acc3,$a3,lo
1874 stp $a0,$a1,[$ap,#8*0]
1875 stp $a2,$a3,[$ap,#8*2]
1878 ldp x19,x20,[x29,#16]
1880 ldp x21,x22,[x29,#32]
1882 ldp x23,x24,[x29,#48]
1883 ldp x25,x26,[x29,#64]
1884 ldp x27,x28,[x29,#80]
1886 .inst 0xd50323bf // autiasp
1888 .size __bn_mul4x_mont,.-__bn_mul4x_mont
1892 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Errors on buffered/piped output only surface at close time, so a checked
# close is required to catch a failed write of the generated assembly.
1898 close STDOUT or die "error closing STDOUT: $!";