]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - crypto/openssl/crypto/bn/asm/mo-586.pl
Fix OpenSSL multiple vulnerabilities. [13:03]
[FreeBSD/releng/9.0.git] / crypto / openssl / crypto / bn / asm / mo-586.pl
1 #!/usr/bin/env perl
2
3 # This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
4 # from OpenSSL 0.9.9-dev 
5
# Hand a string to data_byte as a NUL-terminated run of character codes,
# passing at most 16 values per data_byte call.
sub ::asciz
{
    # Character codes of the argument followed by the terminating zero.
    my @bytes = (unpack("C*", shift), 0);

    # While more than 16 values are pending, peel off the first 16 and
    # emit them as one row.
    &data_byte(splice(@bytes, 0, 16)) while (@bytes > 16);

    # Flush the remainder (at most 16 values).
    &data_byte(@bytes) if (@bytes);
}
15
16 # ====================================================================
17 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
18 # project. The module is, however, dual licensed under OpenSSL and
19 # CRYPTOGAMS licenses depending on where you obtain it. For further
20 # details see http://www.openssl.org/~appro/cryptogams/.
21 # ====================================================================
22
23 # October 2005
24 #
25 # This is a "teaser" code, as it can be improved in several ways...
26 # First of all non-SSE2 path should be implemented (yes, for now it
27 # performs Montgomery multiplication/convolution only on SSE2-capable
28 # CPUs such as P4, others fall back to the original code). Then inner loop
29 # can be unrolled and modulo-scheduled to improve ILP and possibly
30 # moved to 128-bit XMM register bank (though it would require input
31 # rearrangement and/or increase bus bandwidth utilization). Dedicated
32 # squaring procedure should give further performance improvement...
33 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
34 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
35
36 # December 2006
37 #
38 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
39 # Integer-only code [being equipped with dedicated squaring procedure]
40 # gives ~40% on rsa512 sign benchmark...
41
42 push(@INC,"perlasm","../../perlasm");
43 require "x86asm.pl";
44
# Start the perlasm back end, handing it the first command-line argument
# and this script's own name.
&asm_init($ARGV[0], $0);

# $sse2 is 1 when any command-line argument mentions -DOPENSSL_IA32_SSE2,
# and 0 otherwise; it decides whether the SSE2 code path is generated.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

# The capability vector is only referenced by the SSE2 path.
if ($sse2) {
    &external_label("OPENSSL_ia32cap_P");
}
51
# bn_mul_mont prologue: names the registers and stack slots used by the
# rest of the generator, leaves early when num<4, carves a frame of
# $frame+4*(num+2) bytes out of the stack for the tp[] scratch vector,
# and stores copies of the arguments in that frame.
# Argument order as read below: rp, ap, bp, np, n0, num.
52 &function_begin("bn_mul_mont");
53
54 $i="edx";                               # index register i
55 $j="ecx";                               # index register j
56 $ap="esi";      $tp="esi";              # overlapping variables!!!
57 $rp="edi";      $bp="edi";              # overlapping variables!!!
58 $np="ebp";
59 $num="ebx";                             # loaded with num-1 further down
60
61 $_num=&DWP(4*0,"esp");                  # stack top layout
62 $_rp=&DWP(4*1,"esp");                   # copy of rp
63 $_ap=&DWP(4*2,"esp");                   # copy of ap
64 $_bp=&DWP(4*3,"esp");                   # copy of bp (the integer path reuses this slot)
65 $_np=&DWP(4*4,"esp");                   # copy of np
66 $_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");  # n0[0]; $_n0q is the same offset as a QWP operand
67 $_sp=&DWP(4*6,"esp");                   # stack pointer as it was before the frame was carved
68 $_bpend=&DWP(4*7,"esp");                # &bp[num], written and compared by the integer path
69 $frame=32;                              # size of above frame rounded up to 16n
70
71         &xor    ("eax","eax");          # eax=0 is what is left in eax on the early exit below
72         &mov    ("edi",&wparam(5));     # int num
73         &cmp    ("edi",4);
74         &jl     (&label("just_leave")); # num<4: skip everything up to function_end
75
76         &lea    ("esi",&wparam(0));     # put aside pointer to argument block
77         &lea    ("edx",&wparam(1));     # load ap
# NOTE(review): the lea above uses the same form as the one for wparam(0),
# so edx appears to receive the address of the ap argument slot rather
# than the value of ap -- confirm this is what the alignment code wants.
78         &mov    ("ebp","esp");          # saved stack pointer!
79         &add    ("edi",2);              # extra two words on top of tp
80         &neg    ("edi");
81         &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
82         &neg    ("edi");                # edi is num+2 again
83
84         # minimize cache contention by arranging 2K window between stack
85         # pointer and ap argument [np is also position sensitive vector,
86         # but it's assumed to be near ap, as it's allocated at ~same
87         # time].
88         &mov    ("eax","esp");
89         &sub    ("eax","edx");
90         &and    ("eax",2047);
91         &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
92
93         &xor    ("edx","esp");
94         &and    ("edx",2048);
95         &xor    ("edx",2048);
96         &sub    ("esp","edx");          # this splits them apart modulo 4096
97
98         &and    ("esp",-64);            # align to cache line
99
100         ################################# load argument block...
101         &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
102         &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
103         &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
104         &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
105         &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
106         #&mov   ("edi",&DWP(5*4,"esi"));# int num
107
108         &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
109         &mov    ($_rp,"eax");           # ... save a copy of argument block
110         &mov    ($_ap,"ebx");
111         &mov    ($_bp,"ecx");
112         &mov    ($_np,"edx");
113         &mov    ($_n0,"esi");
114         &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling [edi holds num+2 here]
115         #&mov   ($_num,$num);           # redundant as $num is not reused
116         &mov    ($_sp,"ebp");           # saved stack pointer!
117 \f
# SSE2/MMX code path. It is generated only when $sse2 is set. The
# generated code tests bit 26 of the dword that eax points at after
# picmeup("eax","OPENSSL_ia32cap_P") and jumps to non_sse2 when that bit
# is clear (presumably picmeup loads the address of OPENSSL_ia32cap_P --
# confirm against x86asm.pl). Otherwise it runs a first pass for bp[0]
# ("1st"), then one "outer"/"inner" pass per remaining bp[i], and finally
# jumps to common_tail. Throughout this path $num holds num-1.
118 if($sse2) {
119 $acc0="mm0";    # mmx register bank layout
120 $acc1="mm1";
121 $car0="mm2";
122 $car1="mm3";
123 $mul0="mm4";
124 $mul1="mm5";
125 $temp="mm6";
126 $mask="mm7";
127
128         &picmeup("eax","OPENSSL_ia32cap_P");
129         &bt     (&DWP(0,"eax"),26);
130         &jnc    (&label("non_sse2"));   # bit clear: skip the whole MMX path
131
132         &mov    ("eax",-1);
133         &movd   ($mask,"eax");          # mask 32 lower bits
134
135         &mov    ($ap,$_ap);             # load input pointers
136         &mov    ($bp,$_bp);
137         &mov    ($np,$_np);
138
139         &xor    ($i,$i);                # i=0
140         &xor    ($j,$j);                # j=0
141
142         &movd   ($mul0,&DWP(0,$bp));            # bp[0]
143         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
144         &movd   ($car1,&DWP(0,$np));            # np[0]
145
146         &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
147         &movq   ($car0,$mul1);
148         &movq   ($acc0,$mul1);                  # I wish movd worked for
149         &pand   ($acc0,$mask);                  # inter-register transfers
150
151         &pmuludq($mul1,$_n0q);                  # *=n0
152
153         &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
154         &paddq  ($car1,$acc0);
155
156         &movd   ($acc1,&DWP(4,$np));            # np[1]
157         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
158
159         &psrlq  ($car0,32);
160         &psrlq  ($car1,32);
161
162         &inc    ($j);                           # j++
163 &set_label("1st",16);
164         &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
165         &pmuludq($acc1,$mul1);                  # np[j]*m1
166         &paddq  ($car0,$acc0);                  # +=c0
167         &paddq  ($car1,$acc1);                  # +=c1
168
169         &movq   ($acc0,$car0);
170         &pand   ($acc0,$mask);
171         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
172         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
173         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
174         &psrlq  ($car0,32);
175         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
176         &psrlq  ($car1,32);
177
178         &lea    ($j,&DWP(1,$j));
179         &cmp    ($j,$num);
180         &jl     (&label("1st"));                # loop while j < $num
181
182         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
183         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
184         &paddq  ($car0,$acc0);                  # +=c0
185         &paddq  ($car1,$acc1);                  # +=c1
186
187         &movq   ($acc0,$car0);
188         &pand   ($acc0,$mask);
189         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
190         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
191
192         &psrlq  ($car0,32);
193         &psrlq  ($car1,32);
194
195         &paddq  ($car1,$car0);
196         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
197 \f
198         &inc    ($i);                           # i++
199 &set_label("outer");
200         &xor    ($j,$j);                        # j=0
201
202         &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
203         &movd   ($mul1,&DWP(0,$ap));            # ap[0]
204         &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
205         &movd   ($car1,&DWP(0,$np));            # np[0]
206         &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
207
208         &paddq  ($mul1,$temp);                  # +=tp[0]
209         &movq   ($acc0,$mul1);
210         &movq   ($car0,$mul1);
211         &pand   ($acc0,$mask);
212
213         &pmuludq($mul1,$_n0q);                  # *=n0
214
215         &pmuludq($car1,$mul1);
216         &paddq  ($car1,$acc0);
217
218         &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
219         &movd   ($acc1,&DWP(4,$np));            # np[1]
220         &movd   ($acc0,&DWP(4,$ap));            # ap[1]
221
222         &psrlq  ($car0,32);
223         &psrlq  ($car1,32);
224         &paddq  ($car0,$temp);                  # +=tp[1]
225
226         &inc    ($j);                           # j++
227         &dec    ($num);                         # $num doubles as the inner loop's down-counter
228 &set_label("inner");
229         &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
230         &pmuludq($acc1,$mul1);                  # np[j]*m1
231         &paddq  ($car0,$acc0);                  # +=c0
232         &paddq  ($car1,$acc1);                  # +=c1
233
234         &movq   ($acc0,$car0);
235         &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
236         &pand   ($acc0,$mask);
237         &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
238         &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
239         &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
240         &psrlq  ($car0,32);
241         &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
242         &psrlq  ($car1,32);
243         &paddq  ($car0,$temp);                  # +=tp[j+1]
244
245         &dec    ($num);                         # sets the flags tested by jnz below
246         &lea    ($j,&DWP(1,$j));                # j++
247         &jnz    (&label("inner"));
248
249         &mov    ($num,$j);                      # $num was counted down to zero; reload it from j
250         &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
251         &pmuludq($acc1,$mul1);                  # np[num-1]*m1
252         &paddq  ($car0,$acc0);                  # +=c0
253         &paddq  ($car1,$acc1);                  # +=c1
254
255         &movq   ($acc0,$car0);
256         &pand   ($acc0,$mask);
257         &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
258         &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
259         &psrlq  ($car0,32);
260         &psrlq  ($car1,32);
261
262         &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
263         &paddq  ($car1,$car0);
264         &paddq  ($car1,$temp);
265         &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
266
267         &lea    ($i,&DWP(1,$i));                # i++
268         &cmp    ($i,$num);
269         &jle    (&label("outer"));              # loop while i <= $num
270
271         &emms   ();                             # done with mmx bank
272         &jmp    (&label("common_tail"));
273
274 &set_label("non_sse2",16);
275 }
276 \f
# Integer-only code path. The "if (0)" arm is never generated; it is a
# disabled stub that would restore esp and leave with eax=0. The "else"
# arm emits two variants:
#   - a general multiplication (labels mull, 1stmadd, 2ndmadd), and
#   - a squaring variant (labels bn_sqr_mont, sqr, 3rdmadd, sqradd,
#     sqrlast), entered only when ap==bp and num is even.
# Both variants end by jumping to common_tail. On entry $num holds num-1.
277 if (0) {
278         &mov    ("esp",$_sp);
279         &xor    ("eax","eax");  # signal "not fast enough [yet]"
280         &jmp    (&label("just_leave"));
281         # While the below code provides competitive performance for
282         # all key lengths on modern Intel cores, it's still more
283         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
284         # means compared to the original integer-only assembler.
285         # 512-bit RSA sign is better by ~40%, but that's about all
286         # one can say about all CPUs...
287 } else {
288 $inp="esi";     # integer path uses these registers differently
289 $word="edi";
290 $carry="ebp";
291
292         &mov    ($inp,$_ap);
293         &lea    ($carry,&DWP(1,$num));                  # $num+1
294         &mov    ($word,$_bp);
295         &xor    ($j,$j);                                # j=0
296         &mov    ("edx",$inp);
297         &and    ($carry,1);                             # see if num is even
298         &sub    ("edx",$word);                          # see if ap==bp
299         &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
300         &or     ($carry,"edx");                         # zero only if both tests above gave zero
301         &mov    ($word,&DWP(0,$word));                  # bp[0]
302         &jz     (&label("bn_sqr_mont"));                # ap==bp and num even: use the squaring variant
303         &mov    ($_bpend,"eax");
304         &mov    ("eax",&DWP(0,$inp));
305         &xor    ("edx","edx");
306
307 &set_label("mull",16);
308         &mov    ($carry,"edx");
309         &mul    ($word);                                # ap[j]*bp[0]
310         &add    ($carry,"eax");
311         &lea    ($j,&DWP(1,$j));
312         &adc    ("edx",0);
313         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
314         &cmp    ($j,$num);
315         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
316         &jl     (&label("mull"));
317
318         &mov    ($carry,"edx");
319         &mul    ($word);                                # ap[num-1]*bp[0]
320          &mov   ($word,$_n0);
321         &add    ("eax",$carry);
322          &mov   ($inp,$_np);
323         &adc    ("edx",0);
324          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
325
326         &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
327         &xor    ($j,$j);
328         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
329         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
330
331         &mov    ("eax",&DWP(0,$inp));                   # np[0]
332         &mul    ($word);                                # np[0]*m
333         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
334         &mov    ("eax",&DWP(4,$inp));                   # np[1]
335         &adc    ("edx",0);
336         &inc    ($j);
337
338         &jmp    (&label("2ndmadd"));
339 \f\f
340 &set_label("1stmadd",16);
341         &mov    ($carry,"edx");
342         &mul    ($word);                                # ap[j]*bp[i]
343         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
344         &lea    ($j,&DWP(1,$j));
345         &adc    ("edx",0);
346         &add    ($carry,"eax");
347         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
348         &adc    ("edx",0);
349         &cmp    ($j,$num);
350         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
351         &jl     (&label("1stmadd"));
352
353         &mov    ($carry,"edx");
354         &mul    ($word);                                # ap[num-1]*bp[i]
355         &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
356          &mov   ($word,$_n0);
357         &adc    ("edx",0);
358          &mov   ($inp,$_np);
359         &add    ($carry,"eax");
360         &adc    ("edx",0);
361          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
362
363         &xor    ($j,$j);
364         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
365         &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
366         &adc    ($j,0);
367          &mov   ("eax",&DWP(0,$inp));                   # np[0]
368         &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
369         &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
370
371         &mul    ($word);                                # np[0]*m
372         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
373         &mov    ("eax",&DWP(4,$inp));                   # np[1]
374         &adc    ("edx",0);
375         &mov    ($j,1);
376 \f
377 &set_label("2ndmadd",16);
378         &mov    ($carry,"edx");
379         &mul    ($word);                                # np[j]*m
380         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
381         &lea    ($j,&DWP(1,$j));
382         &adc    ("edx",0);
383         &add    ($carry,"eax");
384         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
385         &adc    ("edx",0);
386         &cmp    ($j,$num);
387         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
388         &jl     (&label("2ndmadd"));
389
390         &mov    ($carry,"edx");
391         &mul    ($word);                                # np[j]*m
392         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
393         &adc    ("edx",0);
394         &add    ($carry,"eax");
395         &adc    ("edx",0);
396         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
397
398         &xor    ("eax","eax");
399          &mov   ($j,$_bp);                              # &bp[i]
400         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
401         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
402          &lea   ($j,&DWP(4,$j));                        # advance to the next bp word
403         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
404          &cmp   ($j,$_bpend);
405         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
406         &je     (&label("common_tail"));                # reached &bp[num]: multiplication finished
407
408         &mov    ($word,&DWP(0,$j));                     # bp[i+1]
409         &mov    ($inp,$_ap);
410         &mov    ($_bp,$j);                              # &bp[++i]
411         &xor    ($j,$j);
412         &xor    ("edx","edx");
413         &mov    ("eax",&DWP(0,$inp));
414         &jmp    (&label("1stmadd"));
415 \f
416 &set_label("bn_sqr_mont",16);
417 $sbit=$num;                                             # $sbit is the same register as $num
418         &mov    ($_num,$num);                           # keep $num in its stack slot while the register is reused
419         &mov    ($_bp,$j);                              # i=0
420
421         &mov    ("eax",$word);                          # ap[0]
422         &mul    ($word);                                # ap[0]*ap[0]
423         &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
424         &mov    ($sbit,"edx");
425         &shr    ("edx",1);
426         &and    ($sbit,1);
427         &inc    ($j);
428 &set_label("sqr",16);
429         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
430         &mov    ($carry,"edx");
431         &mul    ($word);                                # ap[j]*ap[0]
432         &add    ("eax",$carry);
433         &lea    ($j,&DWP(1,$j));
434         &adc    ("edx",0);
435         &lea    ($carry,&DWP(0,$sbit,"eax",2));
436         &shr    ("eax",31);
437         &cmp    ($j,$_num);
438         &mov    ($sbit,"eax");
439         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
440         &jl     (&label("sqr"));
441
442         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
443         &mov    ($carry,"edx");
444         &mul    ($word);                                # ap[num-1]*ap[0]
445         &add    ("eax",$carry);
446          &mov   ($word,$_n0);
447         &adc    ("edx",0);
448          &mov   ($inp,$_np);
449         &lea    ($carry,&DWP(0,$sbit,"eax",2));
450          &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
451         &shr    ("eax",31);
452         &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
453
454         &lea    ($carry,&DWP(0,"eax","edx",2));
455          &mov   ("eax",&DWP(0,$inp));                   # np[0]
456         &shr    ("edx",31);
457         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
458         &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
459
460         &mul    ($word);                                # np[0]*m
461         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
462         &mov    ($num,$j);                              # reload $num from j
463         &adc    ("edx",0);
464         &mov    ("eax",&DWP(4,$inp));                   # np[1]
465         &mov    ($j,1);
466 \f\f
467 &set_label("3rdmadd",16);                               # handles two words of np per iteration
468         &mov    ($carry,"edx");
469         &mul    ($word);                                # np[j]*m
470         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
471         &adc    ("edx",0);
472         &add    ($carry,"eax");
473         &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
474         &adc    ("edx",0);
475         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
476
477         &mov    ($carry,"edx");
478         &mul    ($word);                                # np[j+1]*m
479         &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
480         &lea    ($j,&DWP(2,$j));
481         &adc    ("edx",0);
482         &add    ($carry,"eax");
483         &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
484         &adc    ("edx",0);
485         &cmp    ($j,$num);
486         &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
487         &jl     (&label("3rdmadd"));
488
489         &mov    ($carry,"edx");
490         &mul    ($word);                                # np[j]*m
491         &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
492         &adc    ("edx",0);
493         &add    ($carry,"eax");
494         &adc    ("edx",0);
495         &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
496
497         &mov    ($j,$_bp);                              # i
498         &xor    ("eax","eax");
499         &mov    ($inp,$_ap);
500         &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
501         &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
502         &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
503         &cmp    ($j,$num);
504         &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
505         &je     (&label("common_tail"));                # i reached $num: squaring finished
506 \f
507         &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
508         &lea    ($j,&DWP(1,$j));
509         &mov    ("eax",$word);
510         &mov    ($_bp,$j);                              # ++i
511         &mul    ($word);                                # ap[i]*ap[i]
512         &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
513         &adc    ("edx",0);
514         &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
515         &xor    ($carry,$carry);
516         &cmp    ($j,$num);
517         &lea    ($j,&DWP(1,$j));
518         &je     (&label("sqrlast"));                    # tests the cmp above
519
520         &mov    ($sbit,"edx");                          # zaps $num
521         &shr    ("edx",1);
522         &and    ($sbit,1);
523 &set_label("sqradd",16);
524         &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
525         &mov    ($carry,"edx");
526         &mul    ($word);                                # ap[j]*ap[i]
527         &add    ("eax",$carry);
528         &lea    ($carry,&DWP(0,"eax","eax"));
529         &adc    ("edx",0);
530         &shr    ("eax",31);
531         &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
532         &lea    ($j,&DWP(1,$j));
533         &adc    ("eax",0);
534         &add    ($carry,$sbit);
535         &adc    ("eax",0);
536         &cmp    ($j,$_num);
537         &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
538         &mov    ($sbit,"eax");
539         &jle    (&label("sqradd"));
540
541         &mov    ($carry,"edx");
542         &add    ("edx","edx");
543         &shr    ($carry,31);
544         &add    ("edx",$sbit);
545         &adc    ($carry,0);
546 &set_label("sqrlast");
547         &mov    ($word,$_n0);
548         &mov    ($inp,$_np);
549         &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
550
551         &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
552         &mov    ("eax",&DWP(0,$inp));                   # np[0]
553         &adc    ($carry,0);
554         &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
555         &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
556
557         &mul    ($word);                                # np[0]*m
558         &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
559         &lea    ($num,&DWP(-1,$j));                     # reload $num from j-1
560         &adc    ("edx",0);
561         &mov    ($j,1);
562         &mov    ("eax",&DWP(4,$inp));                   # np[1]
563
564         &jmp    (&label("3rdmadd"));
565 }
566 \f
# Shared epilogue reached from both code paths. The "sub" loop writes
# tp[i]-np[i] (with borrow) into rp[]. The mask arithmetic that follows
# picks either tp or rp as the copy source. The "copy" loop then writes
# the chosen words to rp[] and overwrites the tp[] scratch words. Finally
# esp is reloaded from $_sp and eax is set to 1 before just_leave.
567 &set_label("common_tail",16);
568         &mov    ($np,$_np);                     # load modulus pointer
569         &mov    ($rp,$_rp);                     # load result pointer
570         &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
571
572         &mov    ("eax",&DWP(0,$tp));            # tp[0]
573         &mov    ($j,$num);                      # j=num-1
574         &xor    ($i,$i);                        # i=0 and clear CF!
575
576 &set_label("sub",16);
577         &sbb    ("eax",&DWP(0,$np,$i,4));
578         &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
579         &dec    ($j);                           # doesn't affect CF!
580         &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
581         &lea    ($i,&DWP(1,$i));                # i++
582         &jge    (&label("sub"));                # loop until the dec above takes j below zero
583
584         &sbb    ("eax",0);                      # handle upmost overflow bit
585         &and    ($tp,"eax");
586         &not    ("eax");
587         &mov    ($np,$rp);
588         &and    ($np,"eax");
589         &or     ($tp,$np);                      # tp=carry?tp:rp
590
591 &set_label("copy",16);                          # copy or in-place refresh
592         &mov    ("eax",&DWP(0,$tp,$num,4));
593         &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
594         &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
595         &dec    ($num);                         # walks from index num-1 down to 0
596         &jge    (&label("copy"));
597
598         &mov    ("esp",$_sp);           # pull saved stack pointer
599         &mov    ("eax",1);              # eax=1 on this path; the num<4 early exit leaves eax=0
600 &set_label("just_leave");
601 &function_end("bn_mul_mont");
602
# Append the identification string via asciz, then let perlasm finish.
603 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
604
605 &asm_finish();