2 # Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
20 # work. While it does improve RSA sign performance by 20-30% (less for
21 # longer keys) on most processors, for some reason RSA2048 is not
22 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23 # instruction issue rate is limited on processor in question, meaning
24 # that dedicated squaring procedure is a must. Well, actually all
25 # contemporary AArch64 processors seem to have limited multiplication
26 # issue rate, i.e. they can't issue multiplication every cycle, which
27 # explains moderate improvement coefficients in comparison to
28 # compiler-generated code. Recall that compiler is instructed to use
29 # umulh and therefore uses same amount of multiplication instructions
30 # to do the job. Assembly's edge is to minimize number of "collateral"
31 # instructions and of course instruction scheduling.
35 # Squaring procedure that handles lengths divisible by 8 improves
36 # RSA/DSA performance by 25-40-60% depending on processor and key
37 # length. Overall improvement coefficients are always positive in
38 # comparison to compiler-generated code. On Cortex-A57 improvement
39 # is still modest on longest key lengths, while others exhibit e.g.
40 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41 # on Cortex-A57 and ~60-100% faster on others.
# Locate the shared arm-xlate.pl translator: first alongside this script,
# then in the top-level perlasm directory of the source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
foreach my $candidate ("${dir}arm-xlate.pl",
                       "${dir}../../perlasm/arm-xlate.pl") {
    if (-f $candidate) { $xlate = $candidate; last; }
}
defined $xlate or die "can't locate arm-xlate.pl";
# Pipe the generated code through arm-xlate.pl ($flavour/$output come from
# the command line, parsed earlier).  Check the open explicitly: a failed
# pipe would otherwise silently produce an empty or missing .S file and a
# confusing downstream build error.  $output is quoted to survive paths
# containing spaces.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
# Scratch-register map for bn_mul_mont: x6-x17 and x19-x24.  x18 (the
# AAPCS64 platform register) is deliberately skipped; x24 is spare.
my @scratch = map { "x$_" } (6 .. 17, 19 .. 24);
($lo0,$hi0,$aj,$m0,$alo,$ahi,       # ap-side partial products
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,       # np-side partial products
 $ovf,$i,$j,$tp,$tj) = @scratch;    # overflow bit, counters, tp walker
# Argument registers, per AAPCS64:
#   bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#               const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp,$ap,$bp,$np,$n0,$num) = map { "x$_" } (0 .. 5);
70 .type bn_mul_mont,%function
78 stp x29,x30,[sp,#-64]!
84 ldr $m0,[$bp],#8 // bp[0]
86 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
89 and $tp,$tp,#-16 // ABI says so
90 ldp $hi1,$nj,[$np],#16 // np[0..1]
92 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
93 sub $j,$num,#16 // j=num-2
95 mul $alo,$aj,$m0 // ap[1]*bp[0]
98 mul $m1,$lo0,$n0 // "tp[0]"*n0
101 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
103 mul $nlo,$nj,$m1 // np[1]*m1
104 // (*) adds $lo1,$lo1,$lo0 // discarded
105 // (*) As for removal of first multiplication and addition
106 // instructions. The outcome of first addition is
107 // guaranteed to be zero, which leaves two computationally
108 // significant outcomes: it either carries or not. Then
109 // question is when does it carry? Is there alternative
110 // way to deduce it? If you follow operations, you can
111 // observe that condition for carry is quite simple:
112 // $lo0 being non-zero. So that carry can be calculated
113 // by adding -1 to $lo0. That's what next instruction does.
114 subs xzr,$lo0,#1 // (*)
127 mul $alo,$aj,$m0 // ap[j]*bp[0]
132 mul $nlo,$nj,$m1 // np[j]*m1
135 str $lo1,[$tp],#8 // tp[j-1]
140 sub $ap,$ap,$num // rewind $ap
144 sub $np,$np,$num // rewind $np
148 sub $i,$num,#8 // i=num-1
151 adc $ovf,xzr,xzr // upmost overflow bit
155 ldr $m0,[$bp],#8 // bp[i]
156 ldp $hi0,$aj,[$ap],#16
157 ldr $tj,[sp] // tp[0]
160 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
161 sub $j,$num,#16 // j=num-2
163 ldp $hi1,$nj,[$np],#16
164 mul $alo,$aj,$m0 // ap[1]*bp[i]
172 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
174 mul $nlo,$nj,$m1 // np[1]*m1
175 // (*) adds $lo1,$lo1,$lo0
176 subs xzr,$lo0,#1 // (*)
183 ldr $tj,[$tp],#8 // tp[j]
192 mul $alo,$aj,$m0 // ap[j]*bp[i]
197 mul $nlo,$nj,$m1 // np[j]*m1
200 str $lo1,[$tp,#-16] // tp[j-1]
204 ldr $tj,[$tp],#8 // tp[j]
207 sub $ap,$ap,$num // rewind $ap
211 sub $np,$np,$num // rewind $np
220 adc $ovf,$ovf,xzr // upmost overflow bit
221 stp $lo1,$hi1,[$tp,#-16]
225 // Final step. We see if result is larger than modulus, and
226 // if it is, subtract the modulus. But comparison implies
227 // subtraction. So we subtract modulus, see if it borrowed,
228 // and conditionally copy original value.
229 ldr $tj,[sp] // tp[0]
231 ldr $nj,[$np],#8 // np[0]
232 subs $j,$num,#8 // j=num-1 and clear borrow
235 sbcs $aj,$tj,$nj // tp[j]-np[j]
239 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
243 sbcs $ovf,$ovf,xzr // did it borrow?
244 str $aj,[$ap],#8 // rp[num-1]
246 ldr $tj,[sp] // tp[0]
248 ldr $aj,[$rp],#8 // rp[0]
249 sub $num,$num,#8 // num--
252 sub $num,$num,#8 // num--
253 csel $nj,$tj,$aj,lo // did it borrow?
256 str xzr,[$tp,#-16] // wipe tp
258 cbnz $num,.Lcond_copy
261 str xzr,[$tp,#-8] // wipe tp
264 ldp x19,x20,[x29,#16]
266 ldp x21,x22,[x29,#32]
268 ldp x23,x24,[x29,#48]
271 .size bn_mul_mont,.-bn_mul_mont
274 ########################################################################
275 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Register map for __bn_sqr8x_mont.  x18 (platform register) is skipped;
# x30 is reused as the top-most carry after the return address has been
# saved in the frame.
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { "x$_" } (6 .. 13);
my ($t0,$t1,$t2,$t3)                 = map { "x$_" } (14 .. 17);
my ($acc0,$acc1,$acc2,$acc3,
    $acc4,$acc5,$acc6,$acc7)         = map { "x$_" } (19 .. 26);
my ($cnt,$carry,$topmost)            = ("x27", "x28", "x30");
# $tp/$ap_end/$na0 alias registers that are free at this point in the code.
my ($tp,$ap_end,$na0)                = ($bp, $np, $carry);
284 .type __bn_sqr8x_mont,%function
290 .inst 0xd503233f // paciasp
291 stp x29,x30,[sp,#-128]!
298 stp $rp,$np,[sp,#96] // offload rp and np
300 ldp $a0,$a1,[$ap,#8*0]
301 ldp $a2,$a3,[$ap,#8*2]
302 ldp $a4,$a5,[$ap,#8*4]
303 ldp $a6,$a7,[$ap,#8*6]
305 sub $tp,sp,$num,lsl#4
314 stp xzr,xzr,[$tp,#8*0]
315 stp xzr,xzr,[$tp,#8*2]
316 stp xzr,xzr,[$tp,#8*4]
317 stp xzr,xzr,[$tp,#8*6]
319 stp xzr,xzr,[$tp,#8*8]
320 stp xzr,xzr,[$tp,#8*10]
321 stp xzr,xzr,[$tp,#8*12]
322 stp xzr,xzr,[$tp,#8*14]
324 cbnz $cnt,.Lsqr8x_zero
337 str $n0,[x29,#112] // offload n0
339 // Multiply everything but a[i]*a[i]
371 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
375 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
382 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
389 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
390 adc $acc0,xzr,xzr // t[8]
391 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
398 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
411 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
418 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
419 adc $acc1,xzr,xzr // t[9]
425 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
436 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
443 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
444 adc $acc2,xzr,xzr // t[10]
448 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
457 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
464 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
465 adc $acc3,xzr,xzr // t[11]
467 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
474 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
480 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
481 adc $acc4,xzr,xzr // t[12]
485 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
490 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
492 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
493 adc $acc5,xzr,xzr // t[13]
495 sub $cnt,$ap_end,$ap // done yet?
499 sub $t0,$ap_end,$num // rewinded ap
500 adc $acc6,xzr,xzr // t[14]
503 cbz $cnt,.Lsqr8x_outer_break
506 ldp $a0,$a1,[$tp,#8*0]
507 ldp $a2,$a3,[$tp,#8*2]
508 ldp $a4,$a5,[$tp,#8*4]
509 ldp $a6,$a7,[$tp,#8*6]
512 ldp $a0,$a1,[$ap,#8*0]
515 ldp $a2,$a3,[$ap,#8*2]
518 ldp $a4,$a5,[$ap,#8*4]
522 ldp $a6,$a7,[$ap,#8*6]
524 //adc $carry,xzr,xzr // moved below
536 // a[f]a[1]........................
538 // a[f]a[2]........................
540 // a[f]a[3]........................
542 // a[f]a[4]........................
544 // a[f]a[5]........................
546 // a[f]a[6]........................
548 // a[f]a[7]........................
551 adc $carry,xzr,xzr // carry bit, modulo-scheduled
572 adc $carry,$carry,xzr
586 adcs $acc7,$carry,$t3
587 //adc $carry,xzr,xzr // moved above
588 cbnz $cnt,.Lsqr8x_mul
589 // note that carry flag is guaranteed
590 // to be zero at this point
591 cmp $ap,$ap_end // done yet?
594 ldp $a0,$a1,[$tp,#8*0]
595 ldp $a2,$a3,[$tp,#8*2]
596 ldp $a4,$a5,[$tp,#8*4]
597 ldp $a6,$a7,[$tp,#8*6]
601 ldp $a0,$a1,[$ap,#8*0]
604 ldp $a2,$a3,[$ap,#8*2]
607 ldp $a4,$a5,[$ap,#8*4]
611 ldp $a6,$a7,[$ap,#8*6]
613 //adc $carry,xzr,xzr // moved above
618 ldp $a0,$a1,[$rp,#8*0]
620 ldp $a2,$a3,[$rp,#8*2]
621 sub $t0,$ap_end,$ap // is it last iteration?
622 ldp $a4,$a5,[$rp,#8*4]
624 ldp $a6,$a7,[$rp,#8*6]
625 cbz $t0,.Lsqr8x_outer_loop
627 stp $acc0,$acc1,[$tp,#8*0]
628 ldp $acc0,$acc1,[$t1,#8*0]
629 stp $acc2,$acc3,[$tp,#8*2]
630 ldp $acc2,$acc3,[$t1,#8*2]
631 stp $acc4,$acc5,[$tp,#8*4]
632 ldp $acc4,$acc5,[$t1,#8*4]
633 stp $acc6,$acc7,[$tp,#8*6]
635 ldp $acc6,$acc7,[$t1,#8*6]
640 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
641 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
642 ldp $t1,$t2,[sp,#8*1]
643 ldp $a5,$a7,[$t0,#8*2]
645 ldp $t3,$t0,[sp,#8*3]
647 stp $acc0,$acc1,[$tp,#8*0]
649 stp $acc2,$acc3,[$tp,#8*2]
651 stp $acc4,$acc5,[$tp,#8*4]
653 stp $acc6,$acc7,[$tp,#8*6]
656 adds $acc1,$a1,$t1,lsl#1
665 ldp $t1,$t2,[$tp,#8*5]
667 ldp $a1,$a3,[$ap],#8*2
672 stp $acc0,$acc1,[$tp,#8*0]
675 stp $acc2,$acc3,[$tp,#8*2]
677 ldp $t3,$t0,[$tp,#8*7]
682 ldp $t1,$t2,[$tp,#8*9]
684 ldp $a5,$a7,[$ap],#8*2
688 stp $acc4,$acc5,[$tp,#8*4]
690 stp $acc6,$acc7,[$tp,#8*6]
695 ldp $t3,$t0,[$tp,#8*3]
697 cbnz $cnt,.Lsqr4x_shift_n_add
# Reduction phase walks the modulus through the registers that held the
# a-pointer during squaring.  NOTE(review): scoping of this `my` relies on
# an enclosing block not visible in this excerpt — confirm against full file.
699 my ($np,$np_end)=($ap,$ap_end);
701 ldp $np,$n0,[x29,#104] // pull np and n0
706 ldp $t1,$t2,[$tp,#8*5]
709 stp $acc0,$acc1,[$tp,#8*0]
712 stp $acc2,$acc3,[$tp,#8*2]
716 ldp $acc0,$acc1,[sp,#8*0]
719 ldp $a0,$a1,[$np,#8*0]
722 ldp $a2,$a3,[$np,#8*2]
724 ldp $a4,$a5,[$np,#8*4]
726 // Reduce by 512 bits per iteration
727 mul $na0,$n0,$acc0 // t[0]*n0
728 ldp $a6,$a7,[$np,#8*6]
730 ldp $acc2,$acc3,[sp,#8*2]
731 stp $acc4,$acc5,[$tp,#8*4]
732 ldp $acc4,$acc5,[sp,#8*4]
733 stp $acc6,$acc7,[$tp,#8*6]
734 ldp $acc6,$acc7,[sp,#8*6]
736 mov $topmost,xzr // initial top-most carry
741 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
745 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
747 // (*) adds xzr,$acc0,$t0
748 subs xzr,$acc0,#1 // (*)
757 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
773 mul $na0,$n0,$acc0 // next t[0]*n0
778 cbnz $cnt,.Lsqr8x_reduction
780 ldp $t0,$t1,[$tp,#8*0]
781 ldp $t2,$t3,[$tp,#8*2]
783 sub $cnt,$np_end,$np // done yet?
786 ldp $t0,$t1,[$tp,#8*4]
789 ldp $t2,$t3,[$tp,#8*6]
794 //adc $carry,xzr,xzr // moved below
795 cbz $cnt,.Lsqr8x8_post_condition
798 ldp $a0,$a1,[$np,#8*0]
799 ldp $a2,$a3,[$np,#8*2]
800 ldp $a4,$a5,[$np,#8*4]
802 ldp $a6,$a7,[$np,#8*6]
807 adc $carry,xzr,xzr // carry bit, modulo-scheduled
828 adc $carry,$carry,xzr
842 adcs $acc7,$carry,$t3
843 //adc $carry,xzr,xzr // moved above
844 cbnz $cnt,.Lsqr8x_tail
845 // note that carry flag is guaranteed
846 // to be zero at this point
847 ldp $a0,$a1,[$tp,#8*0]
848 sub $cnt,$np_end,$np // done yet?
849 sub $t2,$np_end,$num // rewinded np
850 ldp $a2,$a3,[$tp,#8*2]
851 ldp $a4,$a5,[$tp,#8*4]
852 ldp $a6,$a7,[$tp,#8*6]
853 cbz $cnt,.Lsqr8x_tail_break
858 ldp $a0,$a1,[$np,#8*0]
861 ldp $a2,$a3,[$np,#8*2]
864 ldp $a4,$a5,[$np,#8*4]
868 ldp $a6,$a7,[$np,#8*6]
870 //adc $carry,xzr,xzr // moved above
875 ldr $n0,[x29,#112] // pull n0
876 add $cnt,$tp,#8*8 // end of current t[num] window
878 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
881 ldp $acc0,$acc1,[$rp,#8*0]
883 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
885 ldp $a2,$a3,[$t2,#8*2]
888 ldp $a4,$a5,[$t2,#8*4]
891 ldp $a6,$a7,[$t2,#8*6]
893 adc $topmost,xzr,xzr // top-most carry
895 stp $t0,$t1,[$tp,#8*0]
896 stp $acc2,$acc3,[$tp,#8*2]
897 ldp $acc2,$acc3,[$rp,#8*2]
898 stp $acc4,$acc5,[$tp,#8*4]
899 ldp $acc4,$acc5,[$rp,#8*4]
900 cmp $cnt,x29 // did we hit the bottom?
901 stp $acc6,$acc7,[$tp,#8*6]
902 mov $tp,$rp // slide the window
903 ldp $acc6,$acc7,[$rp,#8*6]
905 b.ne .Lsqr8x_reduction
907 // Final step. We see if result is larger than modulus, and
908 // if it is, subtract the modulus. But comparison implies
909 // subtraction. So we subtract modulus, see if it borrowed,
910 // and conditionally copy original value.
911 ldr $rp,[x29,#96] // pull rp
916 mov $ap_end,$rp // $rp copy
920 ldp $a0,$a1,[$np,#8*0]
922 stp $t0,$t1,[$rp,#8*0]
924 ldp $a2,$a3,[$np,#8*2]
926 stp $t2,$t3,[$rp,#8*2]
928 ldp $a4,$a5,[$np,#8*4]
930 ldp $a6,$a7,[$np,#8*6]
932 ldp $acc0,$acc1,[$tp,#8*0]
934 ldp $acc2,$acc3,[$tp,#8*2]
935 ldp $acc4,$acc5,[$tp,#8*4]
936 ldp $acc6,$acc7,[$tp,#8*6]
938 stp $t0,$t1,[$rp,#8*4]
940 stp $t2,$t3,[$rp,#8*6]
943 cbnz $cnt,.Lsqr8x_sub
948 ldp $a0,$a1,[$ap_end,#8*0]
950 stp $t0,$t1,[$rp,#8*0]
952 ldp $a2,$a3,[$ap_end,#8*2]
954 stp $t2,$t3,[$rp,#8*2]
956 ldp $acc0,$acc1,[$ap,#8*0]
958 ldp $acc2,$acc3,[$ap,#8*2]
959 sbcs xzr,$topmost,xzr // did it borrow?
960 ldr x30,[x29,#8] // pull return address
961 stp $t0,$t1,[$rp,#8*4]
962 stp $t2,$t3,[$rp,#8*6]
967 csel $t0,$acc0,$a0,lo
968 stp xzr,xzr,[$tp,#8*0]
969 csel $t1,$acc1,$a1,lo
970 ldp $a0,$a1,[$ap_end,#8*4]
971 ldp $acc0,$acc1,[$ap,#8*4]
972 csel $t2,$acc2,$a2,lo
973 stp xzr,xzr,[$tp,#8*2]
975 csel $t3,$acc3,$a3,lo
976 ldp $a2,$a3,[$ap_end,#8*6]
977 ldp $acc2,$acc3,[$ap,#8*6]
979 stp $t0,$t1,[$ap_end,#8*0]
980 stp $t2,$t3,[$ap_end,#8*2]
981 add $ap_end,$ap_end,#8*4
982 stp xzr,xzr,[$ap,#8*0]
983 stp xzr,xzr,[$ap,#8*2]
984 cbnz $cnt,.Lsqr4x_cond_copy
986 csel $t0,$acc0,$a0,lo
987 stp xzr,xzr,[$tp,#8*0]
988 csel $t1,$acc1,$a1,lo
989 stp xzr,xzr,[$tp,#8*2]
990 csel $t2,$acc2,$a2,lo
991 csel $t3,$acc3,$a3,lo
992 stp $t0,$t1,[$ap_end,#8*0]
993 stp $t2,$t3,[$ap_end,#8*2]
998 .Lsqr8x8_post_condition:
1000 ldr x30,[x29,#8] // pull return address
1001 // $acc0-7,$carry hold result, $a0-7 hold modulus
1003 ldr $ap,[x29,#96] // pull rp
1005 stp xzr,xzr,[sp,#8*0]
1007 stp xzr,xzr,[sp,#8*2]
1009 stp xzr,xzr,[sp,#8*4]
1011 stp xzr,xzr,[sp,#8*6]
1013 stp xzr,xzr,[sp,#8*8]
1015 stp xzr,xzr,[sp,#8*10]
1017 stp xzr,xzr,[sp,#8*12]
1018 sbcs $carry,$carry,xzr // did it borrow?
1019 stp xzr,xzr,[sp,#8*14]
1021 // $a0-7 hold result-modulus
1022 csel $a0,$acc0,$a0,lo
1023 csel $a1,$acc1,$a1,lo
1024 csel $a2,$acc2,$a2,lo
1025 csel $a3,$acc3,$a3,lo
1026 stp $a0,$a1,[$ap,#8*0]
1027 csel $a4,$acc4,$a4,lo
1028 csel $a5,$acc5,$a5,lo
1029 stp $a2,$a3,[$ap,#8*2]
1030 csel $a6,$acc6,$a6,lo
1031 csel $a7,$acc7,$a7,lo
1032 stp $a4,$a5,[$ap,#8*4]
1033 stp $a6,$a7,[$ap,#8*6]
1036 ldp x19,x20,[x29,#16]
1038 ldp x21,x22,[x29,#32]
1040 ldp x23,x24,[x29,#48]
1041 ldp x25,x26,[x29,#64]
1042 ldp x27,x28,[x29,#80]
1044 .inst 0xd50323bf // autiasp
1046 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1051 ########################################################################
1052 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1053 # x86_64-mont5 module, it's different in sense that it performs
1054 # reduction 256 bits at a time.
# Register assignments for __bn_mul4x_mont, drawn from x6-x17 and x19-x28
# (x18, the platform register, is skipped).  NOTE(review): this excerpt
# elides part of the name list (the $t*/$m* temporaries) — the full file
# maps 22 names onto the 22 registers; do not edit from this view alone.
1056 my ($a0,$a1,$a2,$a3,
1059 $acc0,$acc1,$acc2,$acc3,$acc4,
1060 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
# $carry reuses rp's register — rp itself is offloaded to the stack frame
# (see the "offload rp" store in the prologue) — and $topmost reuses x30.
1062 my ($carry,$topmost) = ($rp,"x30");
1065 .type __bn_mul4x_mont,%function
1068 .inst 0xd503233f // paciasp
1069 stp x29,x30,[sp,#-128]!
1071 stp x19,x20,[sp,#16]
1072 stp x21,x22,[sp,#32]
1073 stp x23,x24,[sp,#48]
1074 stp x25,x26,[sp,#64]
1075 stp x27,x28,[sp,#80]
1077 sub $tp,sp,$num,lsl#3
1079 ldr $n0,[$n0] // *n0
1080 sub sp,$tp,#8*4 // alloca
1083 add $ap_end,$ap,$num
1084 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1086 ldr $bi,[$bp,#8*0] // b[0]
1087 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1088 ldp $a2,$a3,[$ap,#8*2]
1094 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1095 ldp $m2,$m3,[$np,#8*2]
1096 adds $np,$np,#8*4 // clear carry bit
1101 .Loop_mul4x_1st_reduction:
1102 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1103 adc $carry,$carry,xzr // modulo-scheduled
1109 adds $acc0,$acc0,$t0
1110 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1111 adcs $acc1,$acc1,$t1
1112 mul $mi,$acc0,$n0 // t[0]*n0
1113 adcs $acc2,$acc2,$t2
1115 adcs $acc3,$acc3,$t3
1119 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1120 adds $acc1,$acc1,$t0
1121 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1122 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1123 adcs $acc2,$acc2,$t1
1125 adcs $acc3,$acc3,$t2
1127 adc $acc4,$acc4,$t3 // can't overflow
1129 // (*) adds xzr,$acc0,$t0
1130 subs xzr,$acc0,#1 // (*)
1131 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1132 adcs $acc0,$acc1,$t1
1134 adcs $acc1,$acc2,$t2
1136 adcs $acc2,$acc3,$t3
1138 adcs $acc3,$acc4,$carry
1140 adds $acc0,$acc0,$t0
1142 adcs $acc1,$acc1,$t1
1143 adcs $acc2,$acc2,$t2
1144 adcs $acc3,$acc3,$t3
1145 //adc $carry,$carry,xzr
1146 cbnz $cnt,.Loop_mul4x_1st_reduction
1148 cbz $t0,.Lmul4x4_post_condition
1150 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1151 ldp $a2,$a3,[$ap,#8*2]
1153 ldr $mi,[sp] // a[0]*n0
1154 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1155 ldp $m2,$m3,[$np,#8*2]
1158 .Loop_mul4x_1st_tail:
1159 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1160 adc $carry,$carry,xzr // modulo-scheduled
1166 adds $acc0,$acc0,$t0
1167 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1168 adcs $acc1,$acc1,$t1
1170 adcs $acc2,$acc2,$t2
1172 adcs $acc3,$acc3,$t3
1175 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1176 adds $acc1,$acc1,$t0
1177 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1178 adcs $acc2,$acc2,$t1
1180 adcs $acc3,$acc3,$t2
1182 adc $acc4,$acc4,$t3 // can't overflow
1184 adds $acc0,$acc0,$t0
1185 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1186 adcs $acc1,$acc1,$t1
1188 adcs $acc2,$acc2,$t2
1190 adcs $acc3,$acc3,$t3
1191 adcs $acc4,$acc4,$carry
1194 ldr $mi,[sp,$cnt] // next t[0]*n0
1195 str $acc0,[$tp],#8 // result!!!
1196 adds $acc0,$acc1,$t0
1197 sub $t0,$ap_end,$ap // done yet?
1198 adcs $acc1,$acc2,$t1
1199 adcs $acc2,$acc3,$t2
1200 adcs $acc3,$acc4,$t3
1201 //adc $carry,$carry,xzr
1202 cbnz $cnt,.Loop_mul4x_1st_tail
1204 sub $t1,$ap_end,$num // rewinded $ap
1205 cbz $t0,.Lmul4x_proceed
1207 ldp $a0,$a1,[$ap,#8*0]
1208 ldp $a2,$a3,[$ap,#8*2]
1210 ldp $m0,$m1,[$np,#8*0]
1211 ldp $m2,$m3,[$np,#8*2]
1213 b .Loop_mul4x_1st_tail
1217 ldr $bi,[$bp,#8*4]! // *++b
1218 adc $topmost,$carry,xzr
1219 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1220 sub $np,$np,$num // rewind np
1221 ldp $a2,$a3,[$t1,#8*2]
1224 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1225 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1226 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1227 ldp $acc2,$acc3,[sp,#8*6]
1229 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1231 ldp $m2,$m3,[$np,#8*2]
1232 adds $np,$np,#8*4 // clear carry bit
1236 .Loop_mul4x_reduction:
1237 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1238 adc $carry,$carry,xzr // modulo-scheduled
1244 adds $acc0,$acc0,$t0
1245 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1246 adcs $acc1,$acc1,$t1
1247 mul $mi,$acc0,$n0 // t[0]*n0
1248 adcs $acc2,$acc2,$t2
1250 adcs $acc3,$acc3,$t3
1254 ldr $bi,[$bp,$cnt] // next b[i]
1255 adds $acc1,$acc1,$t0
1256 // (*) mul $t0,$m0,$mi
1257 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1258 adcs $acc2,$acc2,$t1
1259 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
1260 adcs $acc3,$acc3,$t2
1262 adc $acc4,$acc4,$t3 // can't overflow
1264 // (*) adds xzr,$acc0,$t0
1265 subs xzr,$acc0,#1 // (*)
1266 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1267 adcs $acc0,$acc1,$t1
1269 adcs $acc1,$acc2,$t2
1271 adcs $acc2,$acc3,$t3
1273 adcs $acc3,$acc4,$carry
1275 adds $acc0,$acc0,$t0
1276 adcs $acc1,$acc1,$t1
1277 adcs $acc2,$acc2,$t2
1278 adcs $acc3,$acc3,$t3
1279 //adc $carry,$carry,xzr
1280 cbnz $cnt,.Loop_mul4x_reduction
1282 adc $carry,$carry,xzr
1283 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1284 ldp $t2,$t3,[$tp,#8*6]
1285 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1286 ldp $a2,$a3,[$ap,#8*2]
1288 adds $acc0,$acc0,$t0
1289 adcs $acc1,$acc1,$t1
1290 adcs $acc2,$acc2,$t2
1291 adcs $acc3,$acc3,$t3
1292 //adc $carry,$carry,xzr
1294 ldr $mi,[sp] // t[0]*n0
1295 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1296 ldp $m2,$m3,[$np,#8*2]
1301 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1302 adc $carry,$carry,xzr // modulo-scheduled
1308 adds $acc0,$acc0,$t0
1309 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1310 adcs $acc1,$acc1,$t1
1312 adcs $acc2,$acc2,$t2
1314 adcs $acc3,$acc3,$t3
1317 ldr $bi,[$bp,$cnt] // next b[i]
1318 adds $acc1,$acc1,$t0
1319 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1320 adcs $acc2,$acc2,$t1
1322 adcs $acc3,$acc3,$t2
1324 adc $acc4,$acc4,$t3 // can't overflow
1326 adds $acc0,$acc0,$t0
1327 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1328 adcs $acc1,$acc1,$t1
1330 adcs $acc2,$acc2,$t2
1332 adcs $acc3,$acc3,$t3
1334 adcs $acc4,$acc4,$carry
1335 ldr $mi,[sp,$cnt] // next a[0]*n0
1337 str $acc0,[$tp],#8 // result!!!
1338 adds $acc0,$acc1,$t0
1339 sub $t0,$ap_end,$ap // done yet?
1340 adcs $acc1,$acc2,$t1
1341 adcs $acc2,$acc3,$t2
1342 adcs $acc3,$acc4,$t3
1343 //adc $carry,$carry,xzr
1344 cbnz $cnt,.Loop_mul4x_tail
1346 sub $t1,$np,$num // rewinded np?
1347 adc $carry,$carry,xzr
1348 cbz $t0,.Loop_mul4x_break
1350 ldp $t0,$t1,[$tp,#8*4]
1351 ldp $t2,$t3,[$tp,#8*6]
1352 ldp $a0,$a1,[$ap,#8*0]
1353 ldp $a2,$a3,[$ap,#8*2]
1355 adds $acc0,$acc0,$t0
1356 adcs $acc1,$acc1,$t1
1357 adcs $acc2,$acc2,$t2
1358 adcs $acc3,$acc3,$t3
1359 //adc $carry,$carry,xzr
1360 ldp $m0,$m1,[$np,#8*0]
1361 ldp $m2,$m3,[$np,#8*2]
1367 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1368 adds $acc0,$acc0,$topmost
1369 add $bp,$bp,#8*4 // bp++
1370 adcs $acc1,$acc1,xzr
1371 sub $ap,$ap,$num // rewind ap
1372 adcs $acc2,$acc2,xzr
1373 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1374 adcs $acc3,$acc3,xzr
1375 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1376 adc $topmost,$carry,xzr
1377 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1378 cmp $bp,$t3 // done yet?
1379 ldp $acc2,$acc3,[sp,#8*6]
1380 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1381 ldp $m2,$m3,[$t1,#8*2]
1386 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1387 ldp $a2,$a3,[$ap,#8*2]
1388 adds $ap,$ap,#8*4 // clear carry bit
1391 b .Loop_mul4x_reduction
1395 // Final step. We see if result is larger than modulus, and
1396 // if it is, subtract the modulus. But comparison implies
1397 // subtraction. So we subtract modulus, see if it borrowed,
1398 // and conditionally copy original value.
1400 mov $ap_end,$t2 // $rp copy
1408 ldp $m0,$m1,[$np,#8*0]
1410 ldp $acc0,$acc1,[$tp,#8*0]
1412 ldp $m2,$m3,[$np,#8*2]
1414 ldp $acc2,$acc3,[$tp,#8*2]
1416 stp $t0,$t1,[$rp,#8*0]
1418 stp $t2,$t3,[$rp,#8*2]
1421 cbnz $cnt,.Lmul4x_sub
1426 ldp $a0,$a1,[$ap_end,#8*0]
1428 stp $t0,$t1,[$rp,#8*0]
1429 ldp $a2,$a3,[$ap_end,#8*2]
1430 stp $t2,$t3,[$rp,#8*2]
1431 ldp $acc0,$acc1,[$ap,#8*0]
1432 ldp $acc2,$acc3,[$ap,#8*2]
1433 sbcs xzr,$topmost,xzr // did it borrow?
1434 ldr x30,[x29,#8] // pull return address
1439 csel $t0,$acc0,$a0,lo
1440 stp xzr,xzr,[$tp,#8*0]
1441 csel $t1,$acc1,$a1,lo
1442 ldp $a0,$a1,[$ap_end,#8*4]
1443 ldp $acc0,$acc1,[$ap,#8*4]
1444 csel $t2,$acc2,$a2,lo
1445 stp xzr,xzr,[$tp,#8*2]
1447 csel $t3,$acc3,$a3,lo
1448 ldp $a2,$a3,[$ap_end,#8*6]
1449 ldp $acc2,$acc3,[$ap,#8*6]
1451 stp $t0,$t1,[$ap_end,#8*0]
1452 stp $t2,$t3,[$ap_end,#8*2]
1453 add $ap_end,$ap_end,#8*4
1454 cbnz $cnt,.Lmul4x_cond_copy
1456 csel $t0,$acc0,$a0,lo
1457 stp xzr,xzr,[$tp,#8*0]
1458 csel $t1,$acc1,$a1,lo
1459 stp xzr,xzr,[$tp,#8*2]
1460 csel $t2,$acc2,$a2,lo
1461 stp xzr,xzr,[$tp,#8*3]
1462 csel $t3,$acc3,$a3,lo
1463 stp xzr,xzr,[$tp,#8*4]
1464 stp $t0,$t1,[$ap_end,#8*0]
1465 stp $t2,$t3,[$ap_end,#8*2]
1470 .Lmul4x4_post_condition:
1471 adc $carry,$carry,xzr
1472 ldr $ap,[x29,#96] // pull rp
1473 // $acc0-3,$carry hold result, $m0-7 hold modulus
1475 ldr x30,[x29,#8] // pull return address
1477 stp xzr,xzr,[sp,#8*0]
1479 stp xzr,xzr,[sp,#8*2]
1481 stp xzr,xzr,[sp,#8*4]
1482 sbcs xzr,$carry,xzr // did it borrow?
1483 stp xzr,xzr,[sp,#8*6]
1485 // $a0-3 hold result-modulus
1486 csel $a0,$acc0,$a0,lo
1487 csel $a1,$acc1,$a1,lo
1488 csel $a2,$acc2,$a2,lo
1489 csel $a3,$acc3,$a3,lo
1490 stp $a0,$a1,[$ap,#8*0]
1491 stp $a2,$a3,[$ap,#8*2]
1494 ldp x19,x20,[x29,#16]
1496 ldp x21,x22,[x29,#32]
1498 ldp x23,x24,[x29,#48]
1499 ldp x25,x26,[x29,#64]
1500 ldp x27,x28,[x29,#80]
1502 .inst 0xd50323bf // autiasp
1504 .size __bn_mul4x_mont,.-__bn_mul4x_mont
1508 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"