/*
 * Provenance: FreeBSD secure/lib/libcrypto/amd64/x86_64-mont5.S
 * (FreeBSD/FreeBSD.git blob; MFC of r325328), captured via a web code
 * viewer — content lines below retain the viewer's embedded line numbers.
 */
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3 .text   
4
5
6
7 .globl  bn_mul_mont_gather5
8 .type   bn_mul_mont_gather5,@function
9 .align  64
10 bn_mul_mont_gather5:
11         movl    %r9d,%r9d
12         movq    %rsp,%rax
13         testl   $7,%r9d
14         jnz     .Lmul_enter
15         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
16         jmp     .Lmul4x_enter
17
18 .align  16
19 .Lmul_enter:
20         movd    8(%rsp),%xmm5
21         pushq   %rbx
22         pushq   %rbp
23         pushq   %r12
24         pushq   %r13
25         pushq   %r14
26         pushq   %r15
27
28         negq    %r9
29         movq    %rsp,%r11
30         leaq    -280(%rsp,%r9,8),%r10
31         negq    %r9
32         andq    $-1024,%r10
33
34
35
36
37
38
39
40         subq    %r10,%r11
41         andq    $-4096,%r11
42         leaq    (%r10,%r11,1),%rsp
43         movq    (%rsp),%r11
44         cmpq    %r10,%rsp
45         ja      .Lmul_page_walk
46         jmp     .Lmul_page_walk_done
47
48 .Lmul_page_walk:
49         leaq    -4096(%rsp),%rsp
50         movq    (%rsp),%r11
51         cmpq    %r10,%rsp
52         ja      .Lmul_page_walk
53 .Lmul_page_walk_done:
54
55         leaq    .Linc(%rip),%r10
56         movq    %rax,8(%rsp,%r9,8)
57 .Lmul_body:
58
59         leaq    128(%rdx),%r12
60         movdqa  0(%r10),%xmm0
61         movdqa  16(%r10),%xmm1
62         leaq    24-112(%rsp,%r9,8),%r10
63         andq    $-16,%r10
64
65         pshufd  $0,%xmm5,%xmm5
66         movdqa  %xmm1,%xmm4
67         movdqa  %xmm1,%xmm2
68         paddd   %xmm0,%xmm1
69         pcmpeqd %xmm5,%xmm0
70 .byte   0x67
71         movdqa  %xmm4,%xmm3
72         paddd   %xmm1,%xmm2
73         pcmpeqd %xmm5,%xmm1
74         movdqa  %xmm0,112(%r10)
75         movdqa  %xmm4,%xmm0
76
77         paddd   %xmm2,%xmm3
78         pcmpeqd %xmm5,%xmm2
79         movdqa  %xmm1,128(%r10)
80         movdqa  %xmm4,%xmm1
81
82         paddd   %xmm3,%xmm0
83         pcmpeqd %xmm5,%xmm3
84         movdqa  %xmm2,144(%r10)
85         movdqa  %xmm4,%xmm2
86
87         paddd   %xmm0,%xmm1
88         pcmpeqd %xmm5,%xmm0
89         movdqa  %xmm3,160(%r10)
90         movdqa  %xmm4,%xmm3
91         paddd   %xmm1,%xmm2
92         pcmpeqd %xmm5,%xmm1
93         movdqa  %xmm0,176(%r10)
94         movdqa  %xmm4,%xmm0
95
96         paddd   %xmm2,%xmm3
97         pcmpeqd %xmm5,%xmm2
98         movdqa  %xmm1,192(%r10)
99         movdqa  %xmm4,%xmm1
100
101         paddd   %xmm3,%xmm0
102         pcmpeqd %xmm5,%xmm3
103         movdqa  %xmm2,208(%r10)
104         movdqa  %xmm4,%xmm2
105
106         paddd   %xmm0,%xmm1
107         pcmpeqd %xmm5,%xmm0
108         movdqa  %xmm3,224(%r10)
109         movdqa  %xmm4,%xmm3
110         paddd   %xmm1,%xmm2
111         pcmpeqd %xmm5,%xmm1
112         movdqa  %xmm0,240(%r10)
113         movdqa  %xmm4,%xmm0
114
115         paddd   %xmm2,%xmm3
116         pcmpeqd %xmm5,%xmm2
117         movdqa  %xmm1,256(%r10)
118         movdqa  %xmm4,%xmm1
119
120         paddd   %xmm3,%xmm0
121         pcmpeqd %xmm5,%xmm3
122         movdqa  %xmm2,272(%r10)
123         movdqa  %xmm4,%xmm2
124
125         paddd   %xmm0,%xmm1
126         pcmpeqd %xmm5,%xmm0
127         movdqa  %xmm3,288(%r10)
128         movdqa  %xmm4,%xmm3
129         paddd   %xmm1,%xmm2
130         pcmpeqd %xmm5,%xmm1
131         movdqa  %xmm0,304(%r10)
132
133         paddd   %xmm2,%xmm3
134 .byte   0x67
135         pcmpeqd %xmm5,%xmm2
136         movdqa  %xmm1,320(%r10)
137
138         pcmpeqd %xmm5,%xmm3
139         movdqa  %xmm2,336(%r10)
140         pand    64(%r12),%xmm0
141
142         pand    80(%r12),%xmm1
143         pand    96(%r12),%xmm2
144         movdqa  %xmm3,352(%r10)
145         pand    112(%r12),%xmm3
146         por     %xmm2,%xmm0
147         por     %xmm3,%xmm1
148         movdqa  -128(%r12),%xmm4
149         movdqa  -112(%r12),%xmm5
150         movdqa  -96(%r12),%xmm2
151         pand    112(%r10),%xmm4
152         movdqa  -80(%r12),%xmm3
153         pand    128(%r10),%xmm5
154         por     %xmm4,%xmm0
155         pand    144(%r10),%xmm2
156         por     %xmm5,%xmm1
157         pand    160(%r10),%xmm3
158         por     %xmm2,%xmm0
159         por     %xmm3,%xmm1
160         movdqa  -64(%r12),%xmm4
161         movdqa  -48(%r12),%xmm5
162         movdqa  -32(%r12),%xmm2
163         pand    176(%r10),%xmm4
164         movdqa  -16(%r12),%xmm3
165         pand    192(%r10),%xmm5
166         por     %xmm4,%xmm0
167         pand    208(%r10),%xmm2
168         por     %xmm5,%xmm1
169         pand    224(%r10),%xmm3
170         por     %xmm2,%xmm0
171         por     %xmm3,%xmm1
172         movdqa  0(%r12),%xmm4
173         movdqa  16(%r12),%xmm5
174         movdqa  32(%r12),%xmm2
175         pand    240(%r10),%xmm4
176         movdqa  48(%r12),%xmm3
177         pand    256(%r10),%xmm5
178         por     %xmm4,%xmm0
179         pand    272(%r10),%xmm2
180         por     %xmm5,%xmm1
181         pand    288(%r10),%xmm3
182         por     %xmm2,%xmm0
183         por     %xmm3,%xmm1
184         por     %xmm1,%xmm0
185         pshufd  $0x4e,%xmm0,%xmm1
186         por     %xmm1,%xmm0
187         leaq    256(%r12),%r12
188 .byte   102,72,15,126,195
189
190         movq    (%r8),%r8
191         movq    (%rsi),%rax
192
193         xorq    %r14,%r14
194         xorq    %r15,%r15
195
196         movq    %r8,%rbp
197         mulq    %rbx
198         movq    %rax,%r10
199         movq    (%rcx),%rax
200
201         imulq   %r10,%rbp
202         movq    %rdx,%r11
203
204         mulq    %rbp
205         addq    %rax,%r10
206         movq    8(%rsi),%rax
207         adcq    $0,%rdx
208         movq    %rdx,%r13
209
210         leaq    1(%r15),%r15
211         jmp     .L1st_enter
212
213 .align  16
214 .L1st:
215         addq    %rax,%r13
216         movq    (%rsi,%r15,8),%rax
217         adcq    $0,%rdx
218         addq    %r11,%r13
219         movq    %r10,%r11
220         adcq    $0,%rdx
221         movq    %r13,-16(%rsp,%r15,8)
222         movq    %rdx,%r13
223
224 .L1st_enter:
225         mulq    %rbx
226         addq    %rax,%r11
227         movq    (%rcx,%r15,8),%rax
228         adcq    $0,%rdx
229         leaq    1(%r15),%r15
230         movq    %rdx,%r10
231
232         mulq    %rbp
233         cmpq    %r9,%r15
234         jne     .L1st
235
236
237         addq    %rax,%r13
238         adcq    $0,%rdx
239         addq    %r11,%r13
240         adcq    $0,%rdx
241         movq    %r13,-16(%rsp,%r9,8)
242         movq    %rdx,%r13
243         movq    %r10,%r11
244
245         xorq    %rdx,%rdx
246         addq    %r11,%r13
247         adcq    $0,%rdx
248         movq    %r13,-8(%rsp,%r9,8)
249         movq    %rdx,(%rsp,%r9,8)
250
251         leaq    1(%r14),%r14
252         jmp     .Louter
253 .align  16
254 .Louter:
255         leaq    24+128(%rsp,%r9,8),%rdx
256         andq    $-16,%rdx
257         pxor    %xmm4,%xmm4
258         pxor    %xmm5,%xmm5
259         movdqa  -128(%r12),%xmm0
260         movdqa  -112(%r12),%xmm1
261         movdqa  -96(%r12),%xmm2
262         movdqa  -80(%r12),%xmm3
263         pand    -128(%rdx),%xmm0
264         pand    -112(%rdx),%xmm1
265         por     %xmm0,%xmm4
266         pand    -96(%rdx),%xmm2
267         por     %xmm1,%xmm5
268         pand    -80(%rdx),%xmm3
269         por     %xmm2,%xmm4
270         por     %xmm3,%xmm5
271         movdqa  -64(%r12),%xmm0
272         movdqa  -48(%r12),%xmm1
273         movdqa  -32(%r12),%xmm2
274         movdqa  -16(%r12),%xmm3
275         pand    -64(%rdx),%xmm0
276         pand    -48(%rdx),%xmm1
277         por     %xmm0,%xmm4
278         pand    -32(%rdx),%xmm2
279         por     %xmm1,%xmm5
280         pand    -16(%rdx),%xmm3
281         por     %xmm2,%xmm4
282         por     %xmm3,%xmm5
283         movdqa  0(%r12),%xmm0
284         movdqa  16(%r12),%xmm1
285         movdqa  32(%r12),%xmm2
286         movdqa  48(%r12),%xmm3
287         pand    0(%rdx),%xmm0
288         pand    16(%rdx),%xmm1
289         por     %xmm0,%xmm4
290         pand    32(%rdx),%xmm2
291         por     %xmm1,%xmm5
292         pand    48(%rdx),%xmm3
293         por     %xmm2,%xmm4
294         por     %xmm3,%xmm5
295         movdqa  64(%r12),%xmm0
296         movdqa  80(%r12),%xmm1
297         movdqa  96(%r12),%xmm2
298         movdqa  112(%r12),%xmm3
299         pand    64(%rdx),%xmm0
300         pand    80(%rdx),%xmm1
301         por     %xmm0,%xmm4
302         pand    96(%rdx),%xmm2
303         por     %xmm1,%xmm5
304         pand    112(%rdx),%xmm3
305         por     %xmm2,%xmm4
306         por     %xmm3,%xmm5
307         por     %xmm5,%xmm4
308         pshufd  $0x4e,%xmm4,%xmm0
309         por     %xmm4,%xmm0
310         leaq    256(%r12),%r12
311
312         movq    (%rsi),%rax
313 .byte   102,72,15,126,195
314
315         xorq    %r15,%r15
316         movq    %r8,%rbp
317         movq    (%rsp),%r10
318
319         mulq    %rbx
320         addq    %rax,%r10
321         movq    (%rcx),%rax
322         adcq    $0,%rdx
323
324         imulq   %r10,%rbp
325         movq    %rdx,%r11
326
327         mulq    %rbp
328         addq    %rax,%r10
329         movq    8(%rsi),%rax
330         adcq    $0,%rdx
331         movq    8(%rsp),%r10
332         movq    %rdx,%r13
333
334         leaq    1(%r15),%r15
335         jmp     .Linner_enter
336
337 .align  16
338 .Linner:
339         addq    %rax,%r13
340         movq    (%rsi,%r15,8),%rax
341         adcq    $0,%rdx
342         addq    %r10,%r13
343         movq    (%rsp,%r15,8),%r10
344         adcq    $0,%rdx
345         movq    %r13,-16(%rsp,%r15,8)
346         movq    %rdx,%r13
347
348 .Linner_enter:
349         mulq    %rbx
350         addq    %rax,%r11
351         movq    (%rcx,%r15,8),%rax
352         adcq    $0,%rdx
353         addq    %r11,%r10
354         movq    %rdx,%r11
355         adcq    $0,%r11
356         leaq    1(%r15),%r15
357
358         mulq    %rbp
359         cmpq    %r9,%r15
360         jne     .Linner
361
362         addq    %rax,%r13
363         adcq    $0,%rdx
364         addq    %r10,%r13
365         movq    (%rsp,%r9,8),%r10
366         adcq    $0,%rdx
367         movq    %r13,-16(%rsp,%r9,8)
368         movq    %rdx,%r13
369
370         xorq    %rdx,%rdx
371         addq    %r11,%r13
372         adcq    $0,%rdx
373         addq    %r10,%r13
374         adcq    $0,%rdx
375         movq    %r13,-8(%rsp,%r9,8)
376         movq    %rdx,(%rsp,%r9,8)
377
378         leaq    1(%r14),%r14
379         cmpq    %r9,%r14
380         jb      .Louter
381
382         xorq    %r14,%r14
383         movq    (%rsp),%rax
384         leaq    (%rsp),%rsi
385         movq    %r9,%r15
386         jmp     .Lsub
387 .align  16
388 .Lsub:  sbbq    (%rcx,%r14,8),%rax
389         movq    %rax,(%rdi,%r14,8)
390         movq    8(%rsi,%r14,8),%rax
391         leaq    1(%r14),%r14
392         decq    %r15
393         jnz     .Lsub
394
395         sbbq    $0,%rax
396         xorq    %r14,%r14
397         andq    %rax,%rsi
398         notq    %rax
399         movq    %rdi,%rcx
400         andq    %rax,%rcx
401         movq    %r9,%r15
402         orq     %rcx,%rsi
403 .align  16
404 .Lcopy:
405         movq    (%rsi,%r14,8),%rax
406         movq    %r14,(%rsp,%r14,8)
407         movq    %rax,(%rdi,%r14,8)
408         leaq    1(%r14),%r14
409         subq    $1,%r15
410         jnz     .Lcopy
411
412         movq    8(%rsp,%r9,8),%rsi
413         movq    $1,%rax
414
415         movq    -48(%rsi),%r15
416         movq    -40(%rsi),%r14
417         movq    -32(%rsi),%r13
418         movq    -24(%rsi),%r12
419         movq    -16(%rsi),%rbp
420         movq    -8(%rsi),%rbx
421         leaq    (%rsi),%rsp
422 .Lmul_epilogue:
423         .byte   0xf3,0xc3
424 .size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
425 .type   bn_mul4x_mont_gather5,@function
426 .align  32
427 bn_mul4x_mont_gather5:
428 .byte   0x67
429         movq    %rsp,%rax
430 .Lmul4x_enter:
431         andl    $0x80108,%r11d
432         cmpl    $0x80108,%r11d
433         je      .Lmulx4x_enter
434         pushq   %rbx
435         pushq   %rbp
436         pushq   %r12
437         pushq   %r13
438         pushq   %r14
439         pushq   %r15
440 .Lmul4x_prologue:
441
442 .byte   0x67
443         shll    $3,%r9d
444         leaq    (%r9,%r9,2),%r10
445         negq    %r9
446
447
448
449
450
451
452
453
454
455
456         leaq    -320(%rsp,%r9,2),%r11
457         movq    %rsp,%rbp
458         subq    %rdi,%r11
459         andq    $4095,%r11
460         cmpq    %r11,%r10
461         jb      .Lmul4xsp_alt
462         subq    %r11,%rbp
463         leaq    -320(%rbp,%r9,2),%rbp
464         jmp     .Lmul4xsp_done
465
466 .align  32
467 .Lmul4xsp_alt:
468         leaq    4096-320(,%r9,2),%r10
469         leaq    -320(%rbp,%r9,2),%rbp
470         subq    %r10,%r11
471         movq    $0,%r10
472         cmovcq  %r10,%r11
473         subq    %r11,%rbp
474 .Lmul4xsp_done:
475         andq    $-64,%rbp
476         movq    %rsp,%r11
477         subq    %rbp,%r11
478         andq    $-4096,%r11
479         leaq    (%r11,%rbp,1),%rsp
480         movq    (%rsp),%r10
481         cmpq    %rbp,%rsp
482         ja      .Lmul4x_page_walk
483         jmp     .Lmul4x_page_walk_done
484
485 .Lmul4x_page_walk:
486         leaq    -4096(%rsp),%rsp
487         movq    (%rsp),%r10
488         cmpq    %rbp,%rsp
489         ja      .Lmul4x_page_walk
490 .Lmul4x_page_walk_done:
491
492         negq    %r9
493
494         movq    %rax,40(%rsp)
495 .Lmul4x_body:
496
497         call    mul4x_internal
498
499         movq    40(%rsp),%rsi
500         movq    $1,%rax
501
502         movq    -48(%rsi),%r15
503         movq    -40(%rsi),%r14
504         movq    -32(%rsi),%r13
505         movq    -24(%rsi),%r12
506         movq    -16(%rsi),%rbp
507         movq    -8(%rsi),%rbx
508         leaq    (%rsi),%rsp
509 .Lmul4x_epilogue:
510         .byte   0xf3,0xc3
511 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
512
513 .type   mul4x_internal,@function
514 .align  32
515 mul4x_internal:
516         shlq    $5,%r9
517         movd    8(%rax),%xmm5
518         leaq    .Linc(%rip),%rax
519         leaq    128(%rdx,%r9,1),%r13
520         shrq    $5,%r9
521         movdqa  0(%rax),%xmm0
522         movdqa  16(%rax),%xmm1
523         leaq    88-112(%rsp,%r9,1),%r10
524         leaq    128(%rdx),%r12
525
526         pshufd  $0,%xmm5,%xmm5
527         movdqa  %xmm1,%xmm4
528 .byte   0x67,0x67
529         movdqa  %xmm1,%xmm2
530         paddd   %xmm0,%xmm1
531         pcmpeqd %xmm5,%xmm0
532 .byte   0x67
533         movdqa  %xmm4,%xmm3
534         paddd   %xmm1,%xmm2
535         pcmpeqd %xmm5,%xmm1
536         movdqa  %xmm0,112(%r10)
537         movdqa  %xmm4,%xmm0
538
539         paddd   %xmm2,%xmm3
540         pcmpeqd %xmm5,%xmm2
541         movdqa  %xmm1,128(%r10)
542         movdqa  %xmm4,%xmm1
543
544         paddd   %xmm3,%xmm0
545         pcmpeqd %xmm5,%xmm3
546         movdqa  %xmm2,144(%r10)
547         movdqa  %xmm4,%xmm2
548
549         paddd   %xmm0,%xmm1
550         pcmpeqd %xmm5,%xmm0
551         movdqa  %xmm3,160(%r10)
552         movdqa  %xmm4,%xmm3
553         paddd   %xmm1,%xmm2
554         pcmpeqd %xmm5,%xmm1
555         movdqa  %xmm0,176(%r10)
556         movdqa  %xmm4,%xmm0
557
558         paddd   %xmm2,%xmm3
559         pcmpeqd %xmm5,%xmm2
560         movdqa  %xmm1,192(%r10)
561         movdqa  %xmm4,%xmm1
562
563         paddd   %xmm3,%xmm0
564         pcmpeqd %xmm5,%xmm3
565         movdqa  %xmm2,208(%r10)
566         movdqa  %xmm4,%xmm2
567
568         paddd   %xmm0,%xmm1
569         pcmpeqd %xmm5,%xmm0
570         movdqa  %xmm3,224(%r10)
571         movdqa  %xmm4,%xmm3
572         paddd   %xmm1,%xmm2
573         pcmpeqd %xmm5,%xmm1
574         movdqa  %xmm0,240(%r10)
575         movdqa  %xmm4,%xmm0
576
577         paddd   %xmm2,%xmm3
578         pcmpeqd %xmm5,%xmm2
579         movdqa  %xmm1,256(%r10)
580         movdqa  %xmm4,%xmm1
581
582         paddd   %xmm3,%xmm0
583         pcmpeqd %xmm5,%xmm3
584         movdqa  %xmm2,272(%r10)
585         movdqa  %xmm4,%xmm2
586
587         paddd   %xmm0,%xmm1
588         pcmpeqd %xmm5,%xmm0
589         movdqa  %xmm3,288(%r10)
590         movdqa  %xmm4,%xmm3
591         paddd   %xmm1,%xmm2
592         pcmpeqd %xmm5,%xmm1
593         movdqa  %xmm0,304(%r10)
594
595         paddd   %xmm2,%xmm3
596 .byte   0x67
597         pcmpeqd %xmm5,%xmm2
598         movdqa  %xmm1,320(%r10)
599
600         pcmpeqd %xmm5,%xmm3
601         movdqa  %xmm2,336(%r10)
602         pand    64(%r12),%xmm0
603
604         pand    80(%r12),%xmm1
605         pand    96(%r12),%xmm2
606         movdqa  %xmm3,352(%r10)
607         pand    112(%r12),%xmm3
608         por     %xmm2,%xmm0
609         por     %xmm3,%xmm1
610         movdqa  -128(%r12),%xmm4
611         movdqa  -112(%r12),%xmm5
612         movdqa  -96(%r12),%xmm2
613         pand    112(%r10),%xmm4
614         movdqa  -80(%r12),%xmm3
615         pand    128(%r10),%xmm5
616         por     %xmm4,%xmm0
617         pand    144(%r10),%xmm2
618         por     %xmm5,%xmm1
619         pand    160(%r10),%xmm3
620         por     %xmm2,%xmm0
621         por     %xmm3,%xmm1
622         movdqa  -64(%r12),%xmm4
623         movdqa  -48(%r12),%xmm5
624         movdqa  -32(%r12),%xmm2
625         pand    176(%r10),%xmm4
626         movdqa  -16(%r12),%xmm3
627         pand    192(%r10),%xmm5
628         por     %xmm4,%xmm0
629         pand    208(%r10),%xmm2
630         por     %xmm5,%xmm1
631         pand    224(%r10),%xmm3
632         por     %xmm2,%xmm0
633         por     %xmm3,%xmm1
634         movdqa  0(%r12),%xmm4
635         movdqa  16(%r12),%xmm5
636         movdqa  32(%r12),%xmm2
637         pand    240(%r10),%xmm4
638         movdqa  48(%r12),%xmm3
639         pand    256(%r10),%xmm5
640         por     %xmm4,%xmm0
641         pand    272(%r10),%xmm2
642         por     %xmm5,%xmm1
643         pand    288(%r10),%xmm3
644         por     %xmm2,%xmm0
645         por     %xmm3,%xmm1
646         por     %xmm1,%xmm0
647         pshufd  $0x4e,%xmm0,%xmm1
648         por     %xmm1,%xmm0
649         leaq    256(%r12),%r12
650 .byte   102,72,15,126,195
651
652         movq    %r13,16+8(%rsp)
653         movq    %rdi,56+8(%rsp)
654
655         movq    (%r8),%r8
656         movq    (%rsi),%rax
657         leaq    (%rsi,%r9,1),%rsi
658         negq    %r9
659
660         movq    %r8,%rbp
661         mulq    %rbx
662         movq    %rax,%r10
663         movq    (%rcx),%rax
664
665         imulq   %r10,%rbp
666         leaq    64+8(%rsp),%r14
667         movq    %rdx,%r11
668
669         mulq    %rbp
670         addq    %rax,%r10
671         movq    8(%rsi,%r9,1),%rax
672         adcq    $0,%rdx
673         movq    %rdx,%rdi
674
675         mulq    %rbx
676         addq    %rax,%r11
677         movq    8(%rcx),%rax
678         adcq    $0,%rdx
679         movq    %rdx,%r10
680
681         mulq    %rbp
682         addq    %rax,%rdi
683         movq    16(%rsi,%r9,1),%rax
684         adcq    $0,%rdx
685         addq    %r11,%rdi
686         leaq    32(%r9),%r15
687         leaq    32(%rcx),%rcx
688         adcq    $0,%rdx
689         movq    %rdi,(%r14)
690         movq    %rdx,%r13
691         jmp     .L1st4x
692
693 .align  32
694 .L1st4x:
695         mulq    %rbx
696         addq    %rax,%r10
697         movq    -16(%rcx),%rax
698         leaq    32(%r14),%r14
699         adcq    $0,%rdx
700         movq    %rdx,%r11
701
702         mulq    %rbp
703         addq    %rax,%r13
704         movq    -8(%rsi,%r15,1),%rax
705         adcq    $0,%rdx
706         addq    %r10,%r13
707         adcq    $0,%rdx
708         movq    %r13,-24(%r14)
709         movq    %rdx,%rdi
710
711         mulq    %rbx
712         addq    %rax,%r11
713         movq    -8(%rcx),%rax
714         adcq    $0,%rdx
715         movq    %rdx,%r10
716
717         mulq    %rbp
718         addq    %rax,%rdi
719         movq    (%rsi,%r15,1),%rax
720         adcq    $0,%rdx
721         addq    %r11,%rdi
722         adcq    $0,%rdx
723         movq    %rdi,-16(%r14)
724         movq    %rdx,%r13
725
726         mulq    %rbx
727         addq    %rax,%r10
728         movq    0(%rcx),%rax
729         adcq    $0,%rdx
730         movq    %rdx,%r11
731
732         mulq    %rbp
733         addq    %rax,%r13
734         movq    8(%rsi,%r15,1),%rax
735         adcq    $0,%rdx
736         addq    %r10,%r13
737         adcq    $0,%rdx
738         movq    %r13,-8(%r14)
739         movq    %rdx,%rdi
740
741         mulq    %rbx
742         addq    %rax,%r11
743         movq    8(%rcx),%rax
744         adcq    $0,%rdx
745         movq    %rdx,%r10
746
747         mulq    %rbp
748         addq    %rax,%rdi
749         movq    16(%rsi,%r15,1),%rax
750         adcq    $0,%rdx
751         addq    %r11,%rdi
752         leaq    32(%rcx),%rcx
753         adcq    $0,%rdx
754         movq    %rdi,(%r14)
755         movq    %rdx,%r13
756
757         addq    $32,%r15
758         jnz     .L1st4x
759
760         mulq    %rbx
761         addq    %rax,%r10
762         movq    -16(%rcx),%rax
763         leaq    32(%r14),%r14
764         adcq    $0,%rdx
765         movq    %rdx,%r11
766
767         mulq    %rbp
768         addq    %rax,%r13
769         movq    -8(%rsi),%rax
770         adcq    $0,%rdx
771         addq    %r10,%r13
772         adcq    $0,%rdx
773         movq    %r13,-24(%r14)
774         movq    %rdx,%rdi
775
776         mulq    %rbx
777         addq    %rax,%r11
778         movq    -8(%rcx),%rax
779         adcq    $0,%rdx
780         movq    %rdx,%r10
781
782         mulq    %rbp
783         addq    %rax,%rdi
784         movq    (%rsi,%r9,1),%rax
785         adcq    $0,%rdx
786         addq    %r11,%rdi
787         adcq    $0,%rdx
788         movq    %rdi,-16(%r14)
789         movq    %rdx,%r13
790
791         leaq    (%rcx,%r9,1),%rcx
792
793         xorq    %rdi,%rdi
794         addq    %r10,%r13
795         adcq    $0,%rdi
796         movq    %r13,-8(%r14)
797
798         jmp     .Louter4x
799
800 .align  32
801 .Louter4x:
802         leaq    16+128(%r14),%rdx
803         pxor    %xmm4,%xmm4
804         pxor    %xmm5,%xmm5
805         movdqa  -128(%r12),%xmm0
806         movdqa  -112(%r12),%xmm1
807         movdqa  -96(%r12),%xmm2
808         movdqa  -80(%r12),%xmm3
809         pand    -128(%rdx),%xmm0
810         pand    -112(%rdx),%xmm1
811         por     %xmm0,%xmm4
812         pand    -96(%rdx),%xmm2
813         por     %xmm1,%xmm5
814         pand    -80(%rdx),%xmm3
815         por     %xmm2,%xmm4
816         por     %xmm3,%xmm5
817         movdqa  -64(%r12),%xmm0
818         movdqa  -48(%r12),%xmm1
819         movdqa  -32(%r12),%xmm2
820         movdqa  -16(%r12),%xmm3
821         pand    -64(%rdx),%xmm0
822         pand    -48(%rdx),%xmm1
823         por     %xmm0,%xmm4
824         pand    -32(%rdx),%xmm2
825         por     %xmm1,%xmm5
826         pand    -16(%rdx),%xmm3
827         por     %xmm2,%xmm4
828         por     %xmm3,%xmm5
829         movdqa  0(%r12),%xmm0
830         movdqa  16(%r12),%xmm1
831         movdqa  32(%r12),%xmm2
832         movdqa  48(%r12),%xmm3
833         pand    0(%rdx),%xmm0
834         pand    16(%rdx),%xmm1
835         por     %xmm0,%xmm4
836         pand    32(%rdx),%xmm2
837         por     %xmm1,%xmm5
838         pand    48(%rdx),%xmm3
839         por     %xmm2,%xmm4
840         por     %xmm3,%xmm5
841         movdqa  64(%r12),%xmm0
842         movdqa  80(%r12),%xmm1
843         movdqa  96(%r12),%xmm2
844         movdqa  112(%r12),%xmm3
845         pand    64(%rdx),%xmm0
846         pand    80(%rdx),%xmm1
847         por     %xmm0,%xmm4
848         pand    96(%rdx),%xmm2
849         por     %xmm1,%xmm5
850         pand    112(%rdx),%xmm3
851         por     %xmm2,%xmm4
852         por     %xmm3,%xmm5
853         por     %xmm5,%xmm4
854         pshufd  $0x4e,%xmm4,%xmm0
855         por     %xmm4,%xmm0
856         leaq    256(%r12),%r12
857 .byte   102,72,15,126,195
858
859         movq    (%r14,%r9,1),%r10
860         movq    %r8,%rbp
861         mulq    %rbx
862         addq    %rax,%r10
863         movq    (%rcx),%rax
864         adcq    $0,%rdx
865
866         imulq   %r10,%rbp
867         movq    %rdx,%r11
868         movq    %rdi,(%r14)
869
870         leaq    (%r14,%r9,1),%r14
871
872         mulq    %rbp
873         addq    %rax,%r10
874         movq    8(%rsi,%r9,1),%rax
875         adcq    $0,%rdx
876         movq    %rdx,%rdi
877
878         mulq    %rbx
879         addq    %rax,%r11
880         movq    8(%rcx),%rax
881         adcq    $0,%rdx
882         addq    8(%r14),%r11
883         adcq    $0,%rdx
884         movq    %rdx,%r10
885
886         mulq    %rbp
887         addq    %rax,%rdi
888         movq    16(%rsi,%r9,1),%rax
889         adcq    $0,%rdx
890         addq    %r11,%rdi
891         leaq    32(%r9),%r15
892         leaq    32(%rcx),%rcx
893         adcq    $0,%rdx
894         movq    %rdx,%r13
895         jmp     .Linner4x
896
897 .align  32
898 .Linner4x:
899         mulq    %rbx
900         addq    %rax,%r10
901         movq    -16(%rcx),%rax
902         adcq    $0,%rdx
903         addq    16(%r14),%r10
904         leaq    32(%r14),%r14
905         adcq    $0,%rdx
906         movq    %rdx,%r11
907
908         mulq    %rbp
909         addq    %rax,%r13
910         movq    -8(%rsi,%r15,1),%rax
911         adcq    $0,%rdx
912         addq    %r10,%r13
913         adcq    $0,%rdx
914         movq    %rdi,-32(%r14)
915         movq    %rdx,%rdi
916
917         mulq    %rbx
918         addq    %rax,%r11
919         movq    -8(%rcx),%rax
920         adcq    $0,%rdx
921         addq    -8(%r14),%r11
922         adcq    $0,%rdx
923         movq    %rdx,%r10
924
925         mulq    %rbp
926         addq    %rax,%rdi
927         movq    (%rsi,%r15,1),%rax
928         adcq    $0,%rdx
929         addq    %r11,%rdi
930         adcq    $0,%rdx
931         movq    %r13,-24(%r14)
932         movq    %rdx,%r13
933
934         mulq    %rbx
935         addq    %rax,%r10
936         movq    0(%rcx),%rax
937         adcq    $0,%rdx
938         addq    (%r14),%r10
939         adcq    $0,%rdx
940         movq    %rdx,%r11
941
942         mulq    %rbp
943         addq    %rax,%r13
944         movq    8(%rsi,%r15,1),%rax
945         adcq    $0,%rdx
946         addq    %r10,%r13
947         adcq    $0,%rdx
948         movq    %rdi,-16(%r14)
949         movq    %rdx,%rdi
950
951         mulq    %rbx
952         addq    %rax,%r11
953         movq    8(%rcx),%rax
954         adcq    $0,%rdx
955         addq    8(%r14),%r11
956         adcq    $0,%rdx
957         movq    %rdx,%r10
958
959         mulq    %rbp
960         addq    %rax,%rdi
961         movq    16(%rsi,%r15,1),%rax
962         adcq    $0,%rdx
963         addq    %r11,%rdi
964         leaq    32(%rcx),%rcx
965         adcq    $0,%rdx
966         movq    %r13,-8(%r14)
967         movq    %rdx,%r13
968
969         addq    $32,%r15
970         jnz     .Linner4x
971
972         mulq    %rbx
973         addq    %rax,%r10
974         movq    -16(%rcx),%rax
975         adcq    $0,%rdx
976         addq    16(%r14),%r10
977         leaq    32(%r14),%r14
978         adcq    $0,%rdx
979         movq    %rdx,%r11
980
981         mulq    %rbp
982         addq    %rax,%r13
983         movq    -8(%rsi),%rax
984         adcq    $0,%rdx
985         addq    %r10,%r13
986         adcq    $0,%rdx
987         movq    %rdi,-32(%r14)
988         movq    %rdx,%rdi
989
990         mulq    %rbx
991         addq    %rax,%r11
992         movq    %rbp,%rax
993         movq    -8(%rcx),%rbp
994         adcq    $0,%rdx
995         addq    -8(%r14),%r11
996         adcq    $0,%rdx
997         movq    %rdx,%r10
998
999         mulq    %rbp
1000         addq    %rax,%rdi
1001         movq    (%rsi,%r9,1),%rax
1002         adcq    $0,%rdx
1003         addq    %r11,%rdi
1004         adcq    $0,%rdx
1005         movq    %r13,-24(%r14)
1006         movq    %rdx,%r13
1007
1008         movq    %rdi,-16(%r14)
1009         leaq    (%rcx,%r9,1),%rcx
1010
1011         xorq    %rdi,%rdi
1012         addq    %r10,%r13
1013         adcq    $0,%rdi
1014         addq    (%r14),%r13
1015         adcq    $0,%rdi
1016         movq    %r13,-8(%r14)
1017
1018         cmpq    16+8(%rsp),%r12
1019         jb      .Louter4x
1020         xorq    %rax,%rax
1021         subq    %r13,%rbp
1022         adcq    %r15,%r15
1023         orq     %r15,%rdi
1024         subq    %rdi,%rax
1025         leaq    (%r14,%r9,1),%rbx
1026         movq    (%rcx),%r12
1027         leaq    (%rcx),%rbp
1028         movq    %r9,%rcx
1029         sarq    $3+2,%rcx
1030         movq    56+8(%rsp),%rdi
1031         decq    %r12
1032         xorq    %r10,%r10
1033         movq    8(%rbp),%r13
1034         movq    16(%rbp),%r14
1035         movq    24(%rbp),%r15
1036         jmp     .Lsqr4x_sub_entry
1037 .size   mul4x_internal,.-mul4x_internal
1038 .globl  bn_power5
1039 .type   bn_power5,@function
1040 .align  32
1041 bn_power5:
1042         movq    %rsp,%rax
1043         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
1044         andl    $0x80108,%r11d
1045         cmpl    $0x80108,%r11d
1046         je      .Lpowerx5_enter
1047         pushq   %rbx
1048         pushq   %rbp
1049         pushq   %r12
1050         pushq   %r13
1051         pushq   %r14
1052         pushq   %r15
1053 .Lpower5_prologue:
1054
1055         shll    $3,%r9d
1056         leal    (%r9,%r9,2),%r10d
1057         negq    %r9
1058         movq    (%r8),%r8
1059
1060
1061
1062
1063
1064
1065
1066
1067         leaq    -320(%rsp,%r9,2),%r11
1068         movq    %rsp,%rbp
1069         subq    %rdi,%r11
1070         andq    $4095,%r11
1071         cmpq    %r11,%r10
1072         jb      .Lpwr_sp_alt
1073         subq    %r11,%rbp
1074         leaq    -320(%rbp,%r9,2),%rbp
1075         jmp     .Lpwr_sp_done
1076
1077 .align  32
1078 .Lpwr_sp_alt:
1079         leaq    4096-320(,%r9,2),%r10
1080         leaq    -320(%rbp,%r9,2),%rbp
1081         subq    %r10,%r11
1082         movq    $0,%r10
1083         cmovcq  %r10,%r11
1084         subq    %r11,%rbp
1085 .Lpwr_sp_done:
1086         andq    $-64,%rbp
1087         movq    %rsp,%r11
1088         subq    %rbp,%r11
1089         andq    $-4096,%r11
1090         leaq    (%r11,%rbp,1),%rsp
1091         movq    (%rsp),%r10
1092         cmpq    %rbp,%rsp
1093         ja      .Lpwr_page_walk
1094         jmp     .Lpwr_page_walk_done
1095
1096 .Lpwr_page_walk:
1097         leaq    -4096(%rsp),%rsp
1098         movq    (%rsp),%r10
1099         cmpq    %rbp,%rsp
1100         ja      .Lpwr_page_walk
1101 .Lpwr_page_walk_done:
1102
1103         movq    %r9,%r10
1104         negq    %r9
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115         movq    %r8,32(%rsp)
1116         movq    %rax,40(%rsp)
1117 .Lpower5_body:
1118 .byte   102,72,15,110,207
1119 .byte   102,72,15,110,209
1120 .byte   102,73,15,110,218
1121 .byte   102,72,15,110,226
1122
1123         call    __bn_sqr8x_internal
1124         call    __bn_post4x_internal
1125         call    __bn_sqr8x_internal
1126         call    __bn_post4x_internal
1127         call    __bn_sqr8x_internal
1128         call    __bn_post4x_internal
1129         call    __bn_sqr8x_internal
1130         call    __bn_post4x_internal
1131         call    __bn_sqr8x_internal
1132         call    __bn_post4x_internal
1133
1134 .byte   102,72,15,126,209
1135 .byte   102,72,15,126,226
1136         movq    %rsi,%rdi
1137         movq    40(%rsp),%rax
1138         leaq    32(%rsp),%r8
1139
1140         call    mul4x_internal
1141
1142         movq    40(%rsp),%rsi
1143         movq    $1,%rax
1144         movq    -48(%rsi),%r15
1145         movq    -40(%rsi),%r14
1146         movq    -32(%rsi),%r13
1147         movq    -24(%rsi),%r12
1148         movq    -16(%rsi),%rbp
1149         movq    -8(%rsi),%rbx
1150         leaq    (%rsi),%rsp
1151 .Lpower5_epilogue:
1152         .byte   0xf3,0xc3
1153 .size   bn_power5,.-bn_power5
1154
.globl  bn_sqr8x_internal
.hidden bn_sqr8x_internal
.type   bn_sqr8x_internal,@function
.align  32
bn_sqr8x_internal:
__bn_sqr8x_internal:
/*
 * bn_sqr8x_internal — 8x-unrolled big-number squaring core.
 *
 * Auto-generated from x86_64-mont5.pl (do not hand-edit the generated
 * file in-tree; change the perlasm source instead).
 *
 * Register contract is established by the callers (bn_power5 /
 * bn_from_mont8x, partly outside this view) — presumably:
 *   %rsi = input vector a[], %r9 = byte length (sign-flipped around
 *   calls), %rbp/%r10 = modulus-related pointers, scratch tort at
 *   48+8(%rsp).  TODO(review): confirm exact contract against
 *   crypto/bn/asm/x86_64-mont5.pl.
 *
 * Structure visible below:
 *   1. cross-product pass (.Lsqr4x_1st / .Lsqr4x_outer / .Lsqr4x_inner)
 *      accumulates a[i]*a[j] (i<j) into the scratch area at
 *      48+8(%rsp,...), 4 words per iteration;
 *   2. shift-and-add pass (.Lsqr4x_shift_n_add) doubles those cross
 *      products (leaq (%r14,%r10,2) = 2*x + carry-in) and folds in the
 *      diagonal squares a[i]^2 (mulq %rax squares %rax);
 *   3. falls through into __bn_sqr8x_reduction.
 *
 * The adc/sbb carry chains are order-sensitive: do not reorder
 * instructions.  The stray ".byte 0x66/0x67" prefixes are alignment /
 * decoder padding emitted by the generator, not operand-size semantics.
 */
        leaq    32(%r10),%rbp
        leaq    (%rsi,%r9,1),%rsi

        movq    %r9,%rcx

        /* first outer iteration: multiply the low words pairwise */
        movq    -32(%rsi,%rbp,1),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi,%rbp,1),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi,%rbp,1),%rbx
        movq    %rax,%r15

        mulq    %r14
        movq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        movq    %r10,-24(%rdi,%rbp,1)

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        adcq    $0,%rdx
        movq    %r11,-16(%rdi,%rbp,1)
        movq    %rdx,%r10

        movq    -8(%rsi,%rbp,1),%rbx
        mulq    %r15
        movq    %rax,%r12
        movq    %rbx,%rax
        movq    %rdx,%r13

        leaq    (%rbp),%rcx
        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)
        jmp     .Lsqr4x_1st

.align  32
/* 4-words-per-iteration cross-product loop; %rcx counts up to 0 */
.Lsqr4x_1st:
        movq    (%rsi,%rcx,1),%rbx
        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    8(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,(%rdi,%rcx,1)
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    16(%rsi,%rcx,1),%rbx
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11

        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %r10,8(%rdi,%rcx,1)
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    24(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,16(%rdi,%rcx,1)
        movq    %rdx,%r13
        adcq    $0,%r13
        leaq    32(%rcx),%rcx           /* advance 4 words */

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)

        cmpq    $0,%rcx
        jne     .Lsqr4x_1st

        /* flush last partials of the first pass */
        mulq    %r15
        addq    %rax,%r13
        leaq    16(%rbp),%rbp
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)
        jmp     .Lsqr4x_outer

.align  32
/* subsequent outer iterations: accumulate onto previously stored words */
.Lsqr4x_outer:
        movq    -32(%rsi,%rbp,1),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi,%rbp,1),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi,%rbp,1),%rbx
        movq    %rax,%r15

        mulq    %r14
        movq    -24(%rdi,%rbp,1),%r10
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        movq    %r10,-24(%rdi,%rbp,1)
        movq    %rdx,%r11

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    -16(%rdi,%rbp,1),%r11
        movq    %rdx,%r10
        adcq    $0,%r10
        movq    %r11,-16(%rdi,%rbp,1)

        xorq    %r12,%r12

        movq    -8(%rsi,%rbp,1),%rbx
        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    -8(%rdi,%rbp,1),%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    %r12,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rbp,1)

        leaq    (%rbp),%rcx
        jmp     .Lsqr4x_inner

.align  32
/* inner accumulation loop, 2 words per iteration, read-modify-write */
.Lsqr4x_inner:
        movq    (%rsi,%rcx,1),%rbx
        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %rdx,%r12
        adcq    $0,%r12
        addq    (%rdi,%rcx,1),%r13
        adcq    $0,%r12

.byte   0x67
        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    8(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %r11,(%rdi,%rcx,1)
        movq    %rbx,%rax
        movq    %rdx,%r13
        adcq    $0,%r13
        addq    8(%rdi,%rcx,1),%r12
        leaq    16(%rcx),%rcx
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    %r12,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)

        cmpq    $0,%rcx
        jne     .Lsqr4x_inner

.byte   0x67
        mulq    %r15
        addq    %rax,%r13
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)

        addq    $16,%rbp
        jnz     .Lsqr4x_outer

        /* epilogue of the cross-product phase: last 3x3 corner */
        movq    -32(%rsi),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi),%rbx
        movq    %rax,%r15

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    %r10,-24(%rdi)
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        movq    -8(%rsi),%rbx
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,-16(%rdi)
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi)

        mulq    %r15
        addq    %rax,%r13
        movq    -16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)

        mulq    %rbx
        addq    $16,%rbp
        xorq    %r14,%r14
        subq    %r9,%rbp
        xorq    %r15,%r15

        addq    %r12,%rax
        adcq    $0,%rdx
        movq    %rax,8(%rdi)
        movq    %rdx,16(%rdi)
        movq    %r15,24(%rdi)

        /*
         * shift-and-add phase: result = 2*(cross products) + squares.
         * leaq (%r14,%r10,2) doubles a word and folds in the previous
         * word's top bit (%r14); mulq %rax forms the diagonal a[i]^2;
         * neg/adc/sbb %r15 threads the carry across iterations.
         */
        movq    -16(%rsi,%rbp,1),%rax
        leaq    48+8(%rsp),%rdi
        xorq    %r10,%r10
        movq    8(%rdi),%r11

        leaq    (%r14,%r10,2),%r12
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    24(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi,%rbp,1),%rax
        movq    %r12,(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    32(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    40(%rdi),%r11
        adcq    %rax,%rbx
        movq    0(%rsi,%rbp,1),%rax
        movq    %rbx,16(%rdi)
        adcq    %rdx,%r8
        leaq    16(%rbp),%rbp
        movq    %r8,24(%rdi)
        sbbq    %r15,%r15
        leaq    64(%rdi),%rdi
        jmp     .Lsqr4x_shift_n_add

.align  32
/* main shift-and-add loop: 4 result pairs (64 output bytes) per pass */
.Lsqr4x_shift_n_add:
        leaq    (%r14,%r10,2),%r12
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    -16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    -8(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi,%rbp,1),%rax
        movq    %r12,-32(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,-24(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    0(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    8(%rdi),%r11
        adcq    %rax,%rbx
        movq    0(%rsi,%rbp,1),%rax
        movq    %rbx,-16(%rdi)
        adcq    %rdx,%r8

        leaq    (%r14,%r10,2),%r12
        movq    %r8,-8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    24(%rdi),%r11
        adcq    %rax,%r12
        movq    8(%rsi,%rbp,1),%rax
        movq    %r12,0(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    32(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    40(%rdi),%r11
        adcq    %rax,%rbx
        movq    16(%rsi,%rbp,1),%rax
        movq    %rbx,16(%rdi)
        adcq    %rdx,%r8
        movq    %r8,24(%rdi)
        sbbq    %r15,%r15
        leaq    64(%rdi),%rdi
        addq    $32,%rbp
        jnz     .Lsqr4x_shift_n_add

        /* final partial pass (no trailing memory reads past the end) */
        leaq    (%r14,%r10,2),%r12
.byte   0x67
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    -16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    -8(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi),%rax
        movq    %r12,-32(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,-24(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        mulq    %rax
        negq    %r15
        adcq    %rax,%rbx
        adcq    %rdx,%r8
        movq    %rbx,-16(%rdi)
        movq    %r8,-8(%rdi)
/* hand-encoded SSE2 GPR<->XMM movq (kept as .byte for old assemblers);
   falls through into __bn_sqr8x_reduction below */
.byte   102,72,15,126,213
/*
 * __bn_sqr8x_reduction — Montgomery reduction of the double-width
 * product, 8 limbs at a time.  Entered by fall-through from
 * __bn_sqr8x_internal and also call'ed directly (see bn_from_mont8x).
 *
 * Visible frame slots: n0 ("-m^-1 mod 2^64") is read from 32+8(%rsp)
 * (stored as 32(%rsp) by the callers before the call); loop bounds are
 * cached at 0+8(%rsp) / 8+8(%rsp).  %rbp walks the modulus, %rdi walks
 * the product/tort area.  adc chains are order-sensitive; .byte
 * 0x66/0x67 are generator-emitted padding prefixes.
 */
__bn_sqr8x_reduction:
        xorq    %rax,%rax
        leaq    (%r9,%rbp,1),%rcx
        leaq    48+8(%rsp,%r9,2),%rdx
        movq    %rcx,0+8(%rsp)          /* end of modulus walk */
        leaq    48+8(%rsp,%r9,1),%rdi
        movq    %rdx,8+8(%rsp)          /* end of output walk */
        negq    %r9
        jmp     .L8x_reduction_loop

.align  32
/* one iteration reduces 8 limbs of the product */
.L8x_reduction_loop:
        leaq    (%rdi,%r9,1),%rdi
.byte   0x66
        movq    0(%rdi),%rbx
        movq    8(%rdi),%r9
        movq    16(%rdi),%r10
        movq    24(%rdi),%r11
        movq    32(%rdi),%r12
        movq    40(%rdi),%r13
        movq    48(%rdi),%r14
        movq    56(%rdi),%r15
        movq    %rax,(%rdx)
        leaq    64(%rdi),%rdi

.byte   0x67
        movq    %rbx,%r8
        imulq   32+8(%rsp),%rbx         /* rbx = limb * n0 */
        movq    0(%rbp),%rax
        movl    $8,%ecx
        jmp     .L8x_reduce

.align  32
/* multiply-accumulate rbx*n[0..7]; 8 iterations per outer loop */
.L8x_reduce:
        mulq    %rbx
        movq    8(%rbp),%rax
        negq    %r8
        movq    %rdx,%r8
        adcq    $0,%r8

        mulq    %rbx
        addq    %rax,%r9
        movq    16(%rbp),%rax
        adcq    $0,%rdx
        addq    %r9,%r8
        movq    %rbx,48-8+8(%rsp,%rcx,8)        /* stash multiplier for tail */
        movq    %rdx,%r9
        adcq    $0,%r9

        mulq    %rbx
        addq    %rax,%r10
        movq    24(%rbp),%rax
        adcq    $0,%rdx
        addq    %r10,%r9
        movq    32+8(%rsp),%rsi
        movq    %rdx,%r10
        adcq    $0,%r10

        mulq    %rbx
        addq    %rax,%r11
        movq    32(%rbp),%rax
        adcq    $0,%rdx
        imulq   %r8,%rsi                /* next multiplier = r8 * n0 */
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %rbx
        addq    %rax,%r12
        movq    40(%rbp),%rax
        adcq    $0,%rdx
        addq    %r12,%r11
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %rbx
        addq    %rax,%r13
        movq    48(%rbp),%rax
        adcq    $0,%rdx
        addq    %r13,%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %rbx
        addq    %rax,%r14
        movq    56(%rbp),%rax
        adcq    $0,%rdx
        addq    %r14,%r13
        movq    %rdx,%r14
        adcq    $0,%r14

        mulq    %rbx
        movq    %rsi,%rbx
        addq    %rax,%r15
        movq    0(%rbp),%rax
        adcq    $0,%rdx
        addq    %r15,%r14
        movq    %rdx,%r15
        adcq    $0,%r15

        decl    %ecx
        jnz     .L8x_reduce

        leaq    64(%rbp),%rbp
        xorq    %rax,%rax
        movq    8+8(%rsp),%rdx
        cmpq    0+8(%rsp),%rbp
        jae     .L8x_no_tail

        /* fold in the next 8 product limbs before the tail passes */
.byte   0x66
        addq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        sbbq    %rsi,%rsi               /* save carry as 0/-1 mask */

        movq    48+56+8(%rsp),%rbx      /* reload stashed multiplier */
        movl    $8,%ecx
        movq    0(%rbp),%rax
        jmp     .L8x_tail

.align  32
/* tail pass: continue rbx*n[] accumulation over remaining limbs */
.L8x_tail:
        mulq    %rbx
        addq    %rax,%r8
        movq    8(%rbp),%rax
        movq    %r8,(%rdi)
        movq    %rdx,%r8
        adcq    $0,%r8

        mulq    %rbx
        addq    %rax,%r9
        movq    16(%rbp),%rax
        adcq    $0,%rdx
        addq    %r9,%r8
        leaq    8(%rdi),%rdi
        movq    %rdx,%r9
        adcq    $0,%r9

        mulq    %rbx
        addq    %rax,%r10
        movq    24(%rbp),%rax
        adcq    $0,%rdx
        addq    %r10,%r9
        movq    %rdx,%r10
        adcq    $0,%r10

        mulq    %rbx
        addq    %rax,%r11
        movq    32(%rbp),%rax
        adcq    $0,%rdx
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %rbx
        addq    %rax,%r12
        movq    40(%rbp),%rax
        adcq    $0,%rdx
        addq    %r12,%r11
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %rbx
        addq    %rax,%r13
        movq    48(%rbp),%rax
        adcq    $0,%rdx
        addq    %r13,%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %rbx
        addq    %rax,%r14
        movq    56(%rbp),%rax
        adcq    $0,%rdx
        addq    %r14,%r13
        movq    %rdx,%r14
        adcq    $0,%r14

        mulq    %rbx
        movq    48-16+8(%rsp,%rcx,8),%rbx       /* next stashed multiplier */
        addq    %rax,%r15
        adcq    $0,%rdx
        addq    %r15,%r14
        movq    0(%rbp),%rax
        movq    %rdx,%r15
        adcq    $0,%r15

        decl    %ecx
        jnz     .L8x_tail

        leaq    64(%rbp),%rbp
        movq    8+8(%rsp),%rdx
        cmpq    0+8(%rsp),%rbp
        jae     .L8x_tail_done

        movq    48+56+8(%rsp),%rbx
        negq    %rsi                    /* restore saved carry into CF */
        movq    0(%rbp),%rax
        adcq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        sbbq    %rsi,%rsi

        movl    $8,%ecx
        jmp     .L8x_tail

.align  32
.L8x_tail_done:
        xorq    %rax,%rax
        addq    (%rdx),%r8              /* top-most carry word */
        adcq    $0,%r9
        adcq    $0,%r10
        adcq    $0,%r11
        adcq    $0,%r12
        adcq    $0,%r13
        adcq    $0,%r14
        adcq    $0,%r15
        adcq    $0,%rax                 /* %rax accumulates final carry */

        negq    %rsi
.L8x_no_tail:
        adcq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        adcq    $0,%rax
        movq    -8(%rbp),%rcx
        xorq    %rsi,%rsi

/* hand-encoded SSE2 GPR<->XMM movq transfers (restore saved pointers) */
.byte   102,72,15,126,213

        movq    %r8,0(%rdi)
        movq    %r9,8(%rdi)
.byte   102,73,15,126,217
        movq    %r10,16(%rdi)
        movq    %r11,24(%rdi)
        movq    %r12,32(%rdi)
        movq    %r13,40(%rdi)
        movq    %r14,48(%rdi)
        movq    %r15,56(%rdi)
        leaq    64(%rdi),%rdi

        cmpq    %rdx,%rdi
        jb      .L8x_reduction_loop
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal — constant-time final subtraction after
 * Montgomery reduction.
 *
 * Computes dst = src - (mask & modulus) without branching on secret
 * data: each modulus word is NOT'ed and AND'ed with the all-zeros /
 * all-ones mask in %rax, then added with carry — i.e. when the mask is
 * all-ones this is exactly "src - n", when zero it is "src + 0".
 * The mask in %rax is prepared by the caller's carry (negq %rax below);
 * presumably %rax arrives as the 0/1 top carry — confirm against the
 * call sites in x86_64-mont5.pl.
 *
 * In:  %rbp = modulus n, %rdi = output, %r9 = byte length (negated),
 *      saved pointers restored from %xmm via the .byte-encoded movq's.
 * Processes 4 words (32 bytes) per loop iteration; %rcx = -(n/4).
 */
.type   __bn_post4x_internal,@function
.align  32
__bn_post4x_internal:
        movq    0(%rbp),%r12
        leaq    (%rdi,%r9,1),%rbx
        movq    %r9,%rcx
.byte   102,72,15,126,207
        negq    %rax                    /* 0/1 carry -> 0/-1 mask */
.byte   102,72,15,126,206
        sarq    $3+2,%rcx               /* bytes -> groups of 4 words */
        decq    %r12                    /* so that NOT gives ~(n-1) = -n-? pattern below */
        xorq    %r10,%r10
        movq    8(%rbp),%r13
        movq    16(%rbp),%r14
        movq    24(%rbp),%r15
        jmp     .Lsqr4x_sub_entry

.align  16
.Lsqr4x_sub:
        movq    0(%rbp),%r12
        movq    8(%rbp),%r13
        movq    16(%rbp),%r14
        movq    24(%rbp),%r15
.Lsqr4x_sub_entry:
        leaq    32(%rbp),%rbp
        notq    %r12
        notq    %r13
        notq    %r14
        notq    %r15
        andq    %rax,%r12               /* masked ~n[i] (0 when no subtract) */
        andq    %rax,%r13
        andq    %rax,%r14
        andq    %rax,%r15

        negq    %r10                    /* borrow from previous group -> CF */
        adcq    0(%rbx),%r12
        adcq    8(%rbx),%r13
        adcq    16(%rbx),%r14
        adcq    24(%rbx),%r15
        movq    %r12,0(%rdi)
        leaq    32(%rbx),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r10,%r10               /* save borrow for next group */
        movq    %r14,16(%rdi)
        movq    %r15,24(%rdi)
        leaq    32(%rdi),%rdi

        incq    %rcx
        jnz     .Lsqr4x_sub

        movq    %r9,%r10
        negq    %r9                     /* restore positive length */
        .byte   0xf3,0xc3               /* rep ret */
.size   __bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery — public dispatcher.
 * If the limb count in %r9d is a multiple of 8, tail-jump to the
 * optimized bn_from_mont8x; otherwise return 0 ("not handled") so the
 * caller falls back to another path.
 */
.globl  bn_from_montgomery
.type   bn_from_montgomery,@function
.align  32
bn_from_montgomery:
        testl   $7,%r9d                 /* num % 8 == 0 ? */
        jz      bn_from_mont8x          /* tail call, same arguments */
        xorl    %eax,%eax               /* return 0 */
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_from_montgomery,.-bn_from_montgomery
1999
/*
 * bn_from_mont8x — per its name, converts a number out of Montgomery
 * representation (multiplies by 1 then Montgomery-reduces); generated
 * from x86_64-mont5.pl.  NOTE(review): exact semantics inherited from
 * the perlasm source — confirm there.
 *
 * SysV args (as used below): %rdi = rp, %rsi = ap, %rcx -> %rbp = np,
 * %r8 = *n0, %r9d = limb count.  Saves all callee-saved GPRs, carves a
 * scratch frame below %rsp (with an explicit page walk so guard pages
 * are touched one at a time), stores n0 at 32(%rsp) and the original
 * %rsp at 40(%rsp), and dispatches to the MULX/ADX reduction when the
 * OPENSSL_ia32cap_P bits 0x80108 are all set (presumably BMI2+ADX —
 * confirm against OPENSSL_ia32cap(3)).
 */
.type   bn_from_mont8x,@function
.align  32
bn_from_mont8x:
.byte   0x67
        movq    %rsp,%rax               /* keep original %rsp for epilogue */
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
.Lfrom_prologue:

        shll    $3,%r9d                 /* limbs -> bytes */
        leaq    (%r9,%r9,2),%r10        /* 3*len */
        negq    %r9
        movq    (%r8),%r8               /* load n0 value */

        /*
         * Choose a scratch area that does not alias rp modulo 4096
         * (cache/page-conflict avoidance), then 64-byte align it.
         */
        leaq    -320(%rsp,%r9,2),%r11
        movq    %rsp,%rbp
        subq    %rdi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lfrom_sp_alt
        subq    %r11,%rbp
        leaq    -320(%rbp,%r9,2),%rbp
        jmp     .Lfrom_sp_done

.align  32
.Lfrom_sp_alt:
        leaq    4096-320(,%r9,2),%r10
        leaq    -320(%rbp,%r9,2),%rbp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11
        subq    %r11,%rbp
.Lfrom_sp_done:
        andq    $-64,%rbp
        movq    %rsp,%r11
        subq    %rbp,%r11
        andq    $-4096,%r11
        leaq    (%r11,%rbp,1),%rsp
        movq    (%rsp),%r10             /* probe */
        cmpq    %rbp,%rsp
        ja      .Lfrom_page_walk
        jmp     .Lfrom_page_walk_done

/* touch each 4K page down to the frame bottom (stack guard probing) */
.Lfrom_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lfrom_page_walk
.Lfrom_page_walk_done:

        movq    %r9,%r10
        negq    %r9

        movq    %r8,32(%rsp)            /* n0 */
        movq    %rax,40(%rsp)           /* saved original %rsp */
.Lfrom_body:
        movq    %r9,%r11
        leaq    48(%rsp),%rax
        pxor    %xmm0,%xmm0
        jmp     .Lmul_by_1

.align  32
/* copy ap into the low half of the tort and zero the high half */
.Lmul_by_1:
        movdqu  (%rsi),%xmm1
        movdqu  16(%rsi),%xmm2
        movdqu  32(%rsi),%xmm3
        movdqa  %xmm0,(%rax,%r9,1)
        movdqu  48(%rsi),%xmm4
        movdqa  %xmm0,16(%rax,%r9,1)
.byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00      /* hand-encoded leaq 64(%rsi),%rsi */
        movdqa  %xmm1,(%rax)
        movdqa  %xmm0,32(%rax,%r9,1)
        movdqa  %xmm2,16(%rax)
        movdqa  %xmm0,48(%rax,%r9,1)
        movdqa  %xmm3,32(%rax)
        movdqa  %xmm4,48(%rax)
        leaq    64(%rax),%rax
        subq    $64,%r11
        jnz     .Lmul_by_1

/* stash rp/len/np in xmm regs (hand-encoded GPR->XMM movq's) */
.byte   102,72,15,110,207
.byte   102,72,15,110,209
.byte   0x67
        movq    %rcx,%rbp
.byte   102,73,15,110,218
        movl    OPENSSL_ia32cap_P+8(%rip),%r11d
        andl    $0x80108,%r11d
        cmpl    $0x80108,%r11d          /* all required CPU features? */
        jne     .Lfrom_mont_nox

        leaq    (%rax,%r9,1),%rdi
        call    __bn_sqrx8x_reduction   /* MULX/ADX path */
        call    __bn_postx4x_internal

        pxor    %xmm0,%xmm0
        leaq    48(%rsp),%rax
        movq    40(%rsp),%rsi
        jmp     .Lfrom_mont_zero

.align  32
.Lfrom_mont_nox:
        call    __bn_sqr8x_reduction    /* classic mul/adc path */
        call    __bn_post4x_internal

        pxor    %xmm0,%xmm0
        leaq    48(%rsp),%rax
        movq    40(%rsp),%rsi
        jmp     .Lfrom_mont_zero

.align  32
/* wipe the scratch frame before returning (no secrets left on stack) */
.Lfrom_mont_zero:
        movdqa  %xmm0,0(%rax)
        movdqa  %xmm0,16(%rax)
        movdqa  %xmm0,32(%rax)
        movdqa  %xmm0,48(%rax)
        leaq    64(%rax),%rax
        subq    $32,%r9
        jnz     .Lfrom_mont_zero

        movq    $1,%rax                 /* return 1 = handled */
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp             /* restore caller stack */
.Lfrom_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5 — MULX/ADX Montgomery multiply with gather-5
 * table lookup; thin wrapper that builds the scratch frame and calls
 * mulx4x_internal (defined below, continues past this view).
 *
 * Same frame protocol as the other entry points: save callee-saved
 * GPRs, pick a 64-byte-aligned scratch area that avoids aliasing rp
 * modulo 4096, page-walk down to it, store n0 at 32(%rsp) and the
 * original %rsp at 40(%rsp).
 */
.type   bn_mulx4x_mont_gather5,@function
.align  32
bn_mulx4x_mont_gather5:
        movq    %rsp,%rax               /* original %rsp, saved at 40(%rsp) */
.Lmulx4x_enter:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
.Lmulx4x_prologue:

        shll    $3,%r9d                 /* limbs -> bytes */
        leaq    (%r9,%r9,2),%r10        /* 3*len */
        negq    %r9
        movq    (%r8),%r8               /* load n0 value */

        /* choose scratch area not aliasing rp mod 4096 */
        leaq    -320(%rsp,%r9,2),%r11
        movq    %rsp,%rbp
        subq    %rdi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lmulx4xsp_alt
        subq    %r11,%rbp
        leaq    -320(%rbp,%r9,2),%rbp
        jmp     .Lmulx4xsp_done

.Lmulx4xsp_alt:
        leaq    4096-320(,%r9,2),%r10
        leaq    -320(%rbp,%r9,2),%rbp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11
        subq    %r11,%rbp
.Lmulx4xsp_done:
        andq    $-64,%rbp               /* 64-byte align frame bottom */
        movq    %rsp,%r11
        subq    %rbp,%r11
        andq    $-4096,%r11
        leaq    (%r11,%rbp,1),%rsp
        movq    (%rsp),%r10             /* probe */
        cmpq    %rbp,%rsp
        ja      .Lmulx4x_page_walk
        jmp     .Lmulx4x_page_walk_done

/* touch each 4K page down to the frame bottom (guard-page probing) */
.Lmulx4x_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

        movq    %r8,32(%rsp)            /* n0 */
        movq    %rax,40(%rsp)           /* saved original %rsp */
.Lmulx4x_body:
        call    mulx4x_internal

        movq    40(%rsp),%rsi
        movq    $1,%rax                 /* return 1 */

        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp             /* restore caller stack */
.Lmulx4x_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2242
2243 .type   mulx4x_internal,@function
2244 .align  32
2245 mulx4x_internal:
2246         movq    %r9,8(%rsp)
2247         movq    %r9,%r10
2248         negq    %r9
2249         shlq    $5,%r9
2250         negq    %r10
2251         leaq    128(%rdx,%r9,1),%r13
2252         shrq    $5+5,%r9
2253         movd    8(%rax),%xmm5
2254         subq    $1,%r9
2255         leaq    .Linc(%rip),%rax
2256         movq    %r13,16+8(%rsp)
2257         movq    %r9,24+8(%rsp)
2258         movq    %rdi,56+8(%rsp)
2259         movdqa  0(%rax),%xmm0
2260         movdqa  16(%rax),%xmm1
2261         leaq    88-112(%rsp,%r10,1),%r10
2262         leaq    128(%rdx),%rdi
2263
2264         pshufd  $0,%xmm5,%xmm5
2265         movdqa  %xmm1,%xmm4
2266 .byte   0x67
2267         movdqa  %xmm1,%xmm2
2268 .byte   0x67
2269         paddd   %xmm0,%xmm1
2270         pcmpeqd %xmm5,%xmm0
2271         movdqa  %xmm4,%xmm3
2272         paddd   %xmm1,%xmm2
2273         pcmpeqd %xmm5,%xmm1
2274         movdqa  %xmm0,112(%r10)
2275         movdqa  %xmm4,%xmm0
2276
2277         paddd   %xmm2,%xmm3
2278         pcmpeqd %xmm5,%xmm2
2279         movdqa  %xmm1,128(%r10)
2280         movdqa  %xmm4,%xmm1
2281
2282         paddd   %xmm3,%xmm0
2283         pcmpeqd %xmm5,%xmm3
2284         movdqa  %xmm2,144(%r10)
2285         movdqa  %xmm4,%xmm2
2286
2287         paddd   %xmm0,%xmm1
2288         pcmpeqd %xmm5,%xmm0
2289         movdqa  %xmm3,160(%r10)
2290         movdqa  %xmm4,%xmm3
2291         paddd   %xmm1,%xmm2
2292         pcmpeqd %xmm5,%xmm1
2293         movdqa  %xmm0,176(%r10)
2294         movdqa  %xmm4,%xmm0
2295
2296         paddd   %xmm2,%xmm3
2297         pcmpeqd %xmm5,%xmm2
2298         movdqa  %xmm1,192(%r10)
2299         movdqa  %xmm4,%xmm1
2300
2301         paddd   %xmm3,%xmm0
2302         pcmpeqd %xmm5,%xmm3
2303         movdqa  %xmm2,208(%r10)
2304         movdqa  %xmm4,%xmm2
2305
2306         paddd   %xmm0,%xmm1
2307         pcmpeqd %xmm5,%xmm0
2308         movdqa  %xmm3,224(%r10)
2309         movdqa  %xmm4,%xmm3
2310         paddd   %xmm1,%xmm2
2311         pcmpeqd %xmm5,%xmm1
2312         movdqa  %xmm0,240(%r10)
2313         movdqa  %xmm4,%xmm0
2314
2315         paddd   %xmm2,%xmm3
2316         pcmpeqd %xmm5,%xmm2
2317         movdqa  %xmm1,256(%r10)
2318         movdqa  %xmm4,%xmm1
2319
2320         paddd   %xmm3,%xmm0
2321         pcmpeqd %xmm5,%xmm3
2322         movdqa  %xmm2,272(%r10)
2323         movdqa  %xmm4,%xmm2
2324
2325         paddd   %xmm0,%xmm1
2326         pcmpeqd %xmm5,%xmm0
2327         movdqa  %xmm3,288(%r10)
2328         movdqa  %xmm4,%xmm3
2329 .byte   0x67
2330         paddd   %xmm1,%xmm2
2331         pcmpeqd %xmm5,%xmm1
2332         movdqa  %xmm0,304(%r10)
2333
2334         paddd   %xmm2,%xmm3
2335         pcmpeqd %xmm5,%xmm2
2336         movdqa  %xmm1,320(%r10)
2337
2338         pcmpeqd %xmm5,%xmm3
2339         movdqa  %xmm2,336(%r10)
2340
2341         pand    64(%rdi),%xmm0
2342         pand    80(%rdi),%xmm1
2343         pand    96(%rdi),%xmm2
2344         movdqa  %xmm3,352(%r10)
2345         pand    112(%rdi),%xmm3
2346         por     %xmm2,%xmm0
2347         por     %xmm3,%xmm1
2348         movdqa  -128(%rdi),%xmm4
2349         movdqa  -112(%rdi),%xmm5
2350         movdqa  -96(%rdi),%xmm2
2351         pand    112(%r10),%xmm4
2352         movdqa  -80(%rdi),%xmm3
2353         pand    128(%r10),%xmm5
2354         por     %xmm4,%xmm0
2355         pand    144(%r10),%xmm2
2356         por     %xmm5,%xmm1
2357         pand    160(%r10),%xmm3
2358         por     %xmm2,%xmm0
2359         por     %xmm3,%xmm1
2360         movdqa  -64(%rdi),%xmm4
2361         movdqa  -48(%rdi),%xmm5
2362         movdqa  -32(%rdi),%xmm2
2363         pand    176(%r10),%xmm4
2364         movdqa  -16(%rdi),%xmm3
2365         pand    192(%r10),%xmm5
2366         por     %xmm4,%xmm0
2367         pand    208(%r10),%xmm2
2368         por     %xmm5,%xmm1
2369         pand    224(%r10),%xmm3
2370         por     %xmm2,%xmm0
2371         por     %xmm3,%xmm1
2372         movdqa  0(%rdi),%xmm4
2373         movdqa  16(%rdi),%xmm5
2374         movdqa  32(%rdi),%xmm2
2375         pand    240(%r10),%xmm4
2376         movdqa  48(%rdi),%xmm3
2377         pand    256(%r10),%xmm5
2378         por     %xmm4,%xmm0
2379         pand    272(%r10),%xmm2
2380         por     %xmm5,%xmm1
2381         pand    288(%r10),%xmm3
2382         por     %xmm2,%xmm0
2383         por     %xmm3,%xmm1
2384         pxor    %xmm1,%xmm0
2385         pshufd  $0x4e,%xmm0,%xmm1
2386         por     %xmm1,%xmm0
2387         leaq    256(%rdi),%rdi
2388 .byte   102,72,15,126,194
2389         leaq    64+32+8(%rsp),%rbx
2390
2391         movq    %rdx,%r9
2392         mulxq   0(%rsi),%r8,%rax
2393         mulxq   8(%rsi),%r11,%r12
2394         addq    %rax,%r11
2395         mulxq   16(%rsi),%rax,%r13
2396         adcq    %rax,%r12
2397         adcq    $0,%r13
2398         mulxq   24(%rsi),%rax,%r14
2399
2400         movq    %r8,%r15
2401         imulq   32+8(%rsp),%r8
2402         xorq    %rbp,%rbp
2403         movq    %r8,%rdx
2404
2405         movq    %rdi,8+8(%rsp)
2406
2407         leaq    32(%rsi),%rsi
2408         adcxq   %rax,%r13
2409         adcxq   %rbp,%r14
2410
2411         mulxq   0(%rcx),%rax,%r10
2412         adcxq   %rax,%r15
2413         adoxq   %r11,%r10
2414         mulxq   8(%rcx),%rax,%r11
2415         adcxq   %rax,%r10
2416         adoxq   %r12,%r11
2417         mulxq   16(%rcx),%rax,%r12
2418         movq    24+8(%rsp),%rdi
2419         movq    %r10,-32(%rbx)
2420         adcxq   %rax,%r11
2421         adoxq   %r13,%r12
2422         mulxq   24(%rcx),%rax,%r15
2423         movq    %r9,%rdx
2424         movq    %r11,-24(%rbx)
2425         adcxq   %rax,%r12
2426         adoxq   %rbp,%r15
2427         leaq    32(%rcx),%rcx
2428         movq    %r12,-16(%rbx)
2429         jmp     .Lmulx4x_1st
2430
2431 .align  32
2432 .Lmulx4x_1st:
2433         adcxq   %rbp,%r15
2434         mulxq   0(%rsi),%r10,%rax
2435         adcxq   %r14,%r10
2436         mulxq   8(%rsi),%r11,%r14
2437         adcxq   %rax,%r11
2438         mulxq   16(%rsi),%r12,%rax
2439         adcxq   %r14,%r12
2440         mulxq   24(%rsi),%r13,%r14
2441 .byte   0x67,0x67
2442         movq    %r8,%rdx
2443         adcxq   %rax,%r13
2444         adcxq   %rbp,%r14
2445         leaq    32(%rsi),%rsi
2446         leaq    32(%rbx),%rbx
2447
2448         adoxq   %r15,%r10
2449         mulxq   0(%rcx),%rax,%r15
2450         adcxq   %rax,%r10
2451         adoxq   %r15,%r11
2452         mulxq   8(%rcx),%rax,%r15
2453         adcxq   %rax,%r11
2454         adoxq   %r15,%r12
2455         mulxq   16(%rcx),%rax,%r15
2456         movq    %r10,-40(%rbx)
2457         adcxq   %rax,%r12
2458         movq    %r11,-32(%rbx)
2459         adoxq   %r15,%r13
2460         mulxq   24(%rcx),%rax,%r15
2461         movq    %r9,%rdx
2462         movq    %r12,-24(%rbx)
2463         adcxq   %rax,%r13
2464         adoxq   %rbp,%r15
2465         leaq    32(%rcx),%rcx
2466         movq    %r13,-16(%rbx)
2467
2468         decq    %rdi
2469         jnz     .Lmulx4x_1st
2470
2471         movq    8(%rsp),%rax
2472         adcq    %rbp,%r15
2473         leaq    (%rsi,%rax,1),%rsi
2474         addq    %r15,%r14
2475         movq    8+8(%rsp),%rdi
2476         adcq    %rbp,%rbp
2477         movq    %r14,-8(%rbx)
2478         jmp     .Lmulx4x_outer
2479
2480 .align  32
2481 .Lmulx4x_outer:
2482         leaq    16-256(%rbx),%r10
2483         pxor    %xmm4,%xmm4
2484 .byte   0x67,0x67
2485         pxor    %xmm5,%xmm5
2486         movdqa  -128(%rdi),%xmm0
2487         movdqa  -112(%rdi),%xmm1
2488         movdqa  -96(%rdi),%xmm2
2489         pand    256(%r10),%xmm0
2490         movdqa  -80(%rdi),%xmm3
2491         pand    272(%r10),%xmm1
2492         por     %xmm0,%xmm4
2493         pand    288(%r10),%xmm2
2494         por     %xmm1,%xmm5
2495         pand    304(%r10),%xmm3
2496         por     %xmm2,%xmm4
2497         por     %xmm3,%xmm5
2498         movdqa  -64(%rdi),%xmm0
2499         movdqa  -48(%rdi),%xmm1
2500         movdqa  -32(%rdi),%xmm2
2501         pand    320(%r10),%xmm0
2502         movdqa  -16(%rdi),%xmm3
2503         pand    336(%r10),%xmm1
2504         por     %xmm0,%xmm4
2505         pand    352(%r10),%xmm2
2506         por     %xmm1,%xmm5
2507         pand    368(%r10),%xmm3
2508         por     %xmm2,%xmm4
2509         por     %xmm3,%xmm5
2510         movdqa  0(%rdi),%xmm0
2511         movdqa  16(%rdi),%xmm1
2512         movdqa  32(%rdi),%xmm2
2513         pand    384(%r10),%xmm0
2514         movdqa  48(%rdi),%xmm3
2515         pand    400(%r10),%xmm1
2516         por     %xmm0,%xmm4
2517         pand    416(%r10),%xmm2
2518         por     %xmm1,%xmm5
2519         pand    432(%r10),%xmm3
2520         por     %xmm2,%xmm4
2521         por     %xmm3,%xmm5
2522         movdqa  64(%rdi),%xmm0
2523         movdqa  80(%rdi),%xmm1
2524         movdqa  96(%rdi),%xmm2
2525         pand    448(%r10),%xmm0
2526         movdqa  112(%rdi),%xmm3
2527         pand    464(%r10),%xmm1
2528         por     %xmm0,%xmm4
2529         pand    480(%r10),%xmm2
2530         por     %xmm1,%xmm5
2531         pand    496(%r10),%xmm3
2532         por     %xmm2,%xmm4
2533         por     %xmm3,%xmm5
2534         por     %xmm5,%xmm4
2535         pshufd  $0x4e,%xmm4,%xmm0
2536         por     %xmm4,%xmm0
2537         leaq    256(%rdi),%rdi
2538 .byte   102,72,15,126,194
2539
2540         movq    %rbp,(%rbx)
2541         leaq    32(%rbx,%rax,1),%rbx
2542         mulxq   0(%rsi),%r8,%r11
2543         xorq    %rbp,%rbp
2544         movq    %rdx,%r9
2545         mulxq   8(%rsi),%r14,%r12
2546         adoxq   -32(%rbx),%r8
2547         adcxq   %r14,%r11
2548         mulxq   16(%rsi),%r15,%r13
2549         adoxq   -24(%rbx),%r11
2550         adcxq   %r15,%r12
2551         mulxq   24(%rsi),%rdx,%r14
2552         adoxq   -16(%rbx),%r12
2553         adcxq   %rdx,%r13
2554         leaq    (%rcx,%rax,1),%rcx
2555         leaq    32(%rsi),%rsi
2556         adoxq   -8(%rbx),%r13
2557         adcxq   %rbp,%r14
2558         adoxq   %rbp,%r14
2559
2560         movq    %r8,%r15
2561         imulq   32+8(%rsp),%r8
2562
2563         movq    %r8,%rdx
2564         xorq    %rbp,%rbp
2565         movq    %rdi,8+8(%rsp)
2566
2567         mulxq   0(%rcx),%rax,%r10
2568         adcxq   %rax,%r15
2569         adoxq   %r11,%r10
2570         mulxq   8(%rcx),%rax,%r11
2571         adcxq   %rax,%r10
2572         adoxq   %r12,%r11
2573         mulxq   16(%rcx),%rax,%r12
2574         adcxq   %rax,%r11
2575         adoxq   %r13,%r12
2576         mulxq   24(%rcx),%rax,%r15
2577         movq    %r9,%rdx
2578         movq    24+8(%rsp),%rdi
2579         movq    %r10,-32(%rbx)
2580         adcxq   %rax,%r12
2581         movq    %r11,-24(%rbx)
2582         adoxq   %rbp,%r15
2583         movq    %r12,-16(%rbx)
2584         leaq    32(%rcx),%rcx
2585         jmp     .Lmulx4x_inner
2586
2587 .align  32
2588 .Lmulx4x_inner:
2589         mulxq   0(%rsi),%r10,%rax
2590         adcxq   %rbp,%r15
2591         adoxq   %r14,%r10
2592         mulxq   8(%rsi),%r11,%r14
2593         adcxq   0(%rbx),%r10
2594         adoxq   %rax,%r11
2595         mulxq   16(%rsi),%r12,%rax
2596         adcxq   8(%rbx),%r11
2597         adoxq   %r14,%r12
2598         mulxq   24(%rsi),%r13,%r14
2599         movq    %r8,%rdx
2600         adcxq   16(%rbx),%r12
2601         adoxq   %rax,%r13
2602         adcxq   24(%rbx),%r13
2603         adoxq   %rbp,%r14
2604         leaq    32(%rsi),%rsi
2605         leaq    32(%rbx),%rbx
2606         adcxq   %rbp,%r14
2607
2608         adoxq   %r15,%r10
2609         mulxq   0(%rcx),%rax,%r15
2610         adcxq   %rax,%r10
2611         adoxq   %r15,%r11
2612         mulxq   8(%rcx),%rax,%r15
2613         adcxq   %rax,%r11
2614         adoxq   %r15,%r12
2615         mulxq   16(%rcx),%rax,%r15
2616         movq    %r10,-40(%rbx)
2617         adcxq   %rax,%r12
2618         adoxq   %r15,%r13
2619         movq    %r11,-32(%rbx)
2620         mulxq   24(%rcx),%rax,%r15
2621         movq    %r9,%rdx
2622         leaq    32(%rcx),%rcx
2623         movq    %r12,-24(%rbx)
2624         adcxq   %rax,%r13
2625         adoxq   %rbp,%r15
2626         movq    %r13,-16(%rbx)
2627
2628         decq    %rdi
2629         jnz     .Lmulx4x_inner
2630
2631         movq    0+8(%rsp),%rax
2632         adcq    %rbp,%r15
2633         subq    0(%rbx),%rdi
2634         movq    8+8(%rsp),%rdi
2635         movq    16+8(%rsp),%r10
2636         adcq    %r15,%r14
2637         leaq    (%rsi,%rax,1),%rsi
2638         adcq    %rbp,%rbp
2639         movq    %r14,-8(%rbx)
2640
2641         cmpq    %r10,%rdi
2642         jb      .Lmulx4x_outer
2643
2644         movq    -8(%rcx),%r10
2645         movq    %rbp,%r8
2646         movq    (%rcx,%rax,1),%r12
2647         leaq    (%rcx,%rax,1),%rbp
2648         movq    %rax,%rcx
2649         leaq    (%rbx,%rax,1),%rdi
2650         xorl    %eax,%eax
2651         xorq    %r15,%r15
2652         subq    %r14,%r10
2653         adcq    %r15,%r15
2654         orq     %r15,%r8
2655         sarq    $3+2,%rcx
2656         subq    %r8,%rax
2657         movq    56+8(%rsp),%rdx
2658         decq    %r12
2659         movq    8(%rbp),%r13
2660         xorq    %r8,%r8
2661         movq    16(%rbp),%r14
2662         movq    24(%rbp),%r15
2663         jmp     .Lsqrx4x_sub_entry
2664 .size   mulx4x_internal,.-mulx4x_internal
2665 .type   bn_powerx5,@function
2666 .align  32
/*
 * bn_powerx5 -- MULX/ADX code path: performs five Montgomery squarings
 * (__bn_sqrx8x_internal + __bn_postx4x_internal, called five times below)
 * followed by one Montgomery multiplication (mulx4x_internal).
 * Auto-generated from x86_64-mont5.pl; the decimal prefix on each line is
 * a listing artifact and not part of the original instructions.
 *
 * SysV AMD64 args: %rdi,%rsi,%rdx,%rcx,%r8,%r9.  %r9 arrives as a word
 * count and is scaled to bytes below; (%r8) is dereferenced once, so %r8
 * points at a value loaded before the main body (presumably n0, stored at
 * 32(%rsp) -- NOTE(review): confirm roles against x86_64-mont5.pl).
 * %rax keeps the caller's %rsp so the epilogue can unwind from 40(%rsp).
 */
2667 bn_powerx5:
2668         movq    %rsp,%rax
2669 .Lpowerx5_enter:
2670         pushq   %rbx
2671         pushq   %rbp
2672         pushq   %r12
2673         pushq   %r13
2674         pushq   %r14
2675         pushq   %r15
2676 .Lpowerx5_prologue:
2677
/* Scale the length to bytes (%r9 *= 8); %r10 = 3*%r9 bytes of scratch. */
2678         shll    $3,%r9d
2679         leaq    (%r9,%r9,2),%r10
2680         negq    %r9
2681         movq    (%r8),%r8
2682
2683
2684
2685
2686
2687
2688
2689
/*
 * Pick a scratch frame below %rsp.  The subq/andq-$4095 sequence measures
 * the frame's distance from %rdi modulo a 4K page and, via the
 * .Lpwrx_sp_alt path, moves the frame so it does not land at the same
 * page offset as the output pointer (cache-aliasing avoidance in the
 * original perlasm -- NOTE(review): intent inferred from upstream source).
 */
2690         leaq    -320(%rsp,%r9,2),%r11
2691         movq    %rsp,%rbp
2692         subq    %rdi,%r11
2693         andq    $4095,%r11
2694         cmpq    %r11,%r10
2695         jb      .Lpwrx_sp_alt
2696         subq    %r11,%rbp
2697         leaq    -320(%rbp,%r9,2),%rbp
2698         jmp     .Lpwrx_sp_done
2699
2700 .align  32
2701 .Lpwrx_sp_alt:
2702         leaq    4096-320(,%r9,2),%r10
2703         leaq    -320(%rbp,%r9,2),%rbp
2704         subq    %r10,%r11
2705         movq    $0,%r10
2706         cmovcq  %r10,%r11
2707         subq    %r11,%rbp
2708 .Lpwrx_sp_done:
2709         andq    $-64,%rbp
2710         movq    %rsp,%r11
2711         subq    %rbp,%r11
2712         andq    $-4096,%r11
2713         leaq    (%r11,%rbp,1),%rsp
2714         movq    (%rsp),%r10
2715         cmpq    %rbp,%rsp
2716         ja      .Lpwrx_page_walk
2717         jmp     .Lpwrx_page_walk_done
2718
/*
 * Stack probing: touch the newly claimed stack one 4K page at a time so
 * the OS guard page is grown instead of skipped over.
 */
2719 .Lpwrx_page_walk:
2720         leaq    -4096(%rsp),%rsp
2721         movq    (%rsp),%r10
2722         cmpq    %rbp,%rsp
2723         ja      .Lpwrx_page_walk
2724 .Lpwrx_page_walk_done:
2725
2726         movq    %r9,%r10
2727         negq    %r9
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
/*
 * Hand-encoded movq GPR->XMM: stash pointers in xmm registers so they
 * survive the internal calls (which use GPRs freely):
 *   66 48 0f 6e cf = movq %rdi,%xmm1   (output pointer)
 *   66 48 0f 6e d1 = movq %rcx,%xmm2
 *   66 49 0f 6e da = movq %r10,%xmm3
 *   66 48 0f 6e e2 = movq %rdx,%xmm4
 */
2740         pxor    %xmm0,%xmm0
2741 .byte   102,72,15,110,207
2742 .byte   102,72,15,110,209
2743 .byte   102,73,15,110,218
2744 .byte   102,72,15,110,226
/* Save n0 at 32(%rsp) and the caller's %rsp at 40(%rsp) for the epilogue. */
2745         movq    %r8,32(%rsp)
2746         movq    %rax,40(%rsp)
2747 .Lpowerx5_body:
2748
/* a^32 mod n by five consecutive Montgomery squarings... */
2749         call    __bn_sqrx8x_internal
2750         call    __bn_postx4x_internal
2751         call    __bn_sqrx8x_internal
2752         call    __bn_postx4x_internal
2753         call    __bn_sqrx8x_internal
2754         call    __bn_postx4x_internal
2755         call    __bn_sqrx8x_internal
2756         call    __bn_postx4x_internal
2757         call    __bn_sqrx8x_internal
2758         call    __bn_postx4x_internal
2759
/* ...then one Montgomery multiplication.  Recover stashed pointers:
 *   66 48 0f 7e d1 = movq %xmm2,%rcx
 *   66 48 0f 7e e2 = movq %xmm4,%rdx
 */
2760         movq    %r10,%r9
2761         movq    %rsi,%rdi
2762 .byte   102,72,15,126,209
2763 .byte   102,72,15,126,226
2764         movq    40(%rsp),%rax
2765
2766         call    mulx4x_internal
2767
/* Epilogue: restore callee-saved registers relative to the saved %rsp. */
2768         movq    40(%rsp),%rsi
2769         movq    $1,%rax
2770
2771         movq    -48(%rsi),%r15
2772         movq    -40(%rsi),%r14
2773         movq    -32(%rsi),%r13
2774         movq    -24(%rsi),%r12
2775         movq    -16(%rsi),%rbp
2776         movq    -8(%rsi),%rbx
2777         leaq    (%rsi),%rsp
2778 .Lpowerx5_epilogue:
2779         .byte   0xf3,0xc3
2780 .size   bn_powerx5,.-bn_powerx5
2781
2782 .globl  bn_sqrx8x_internal
2783 .hidden bn_sqrx8x_internal
2784 .type   bn_sqrx8x_internal,@function
2785 .align  32
/*
 * bn_sqrx8x_internal -- MULX/ADCX/ADOX (BMI2+ADX) big-number squaring
 * followed by Montgomery reduction (__bn_sqrx8x_reduction below).
 * The blank numbered lines that follow replace the large register-layout
 * comment of the upstream perlasm source, stripped by the generator.
 * %rsi = input a, %r9 = length in bytes (negative offsets used), scratch
 * result area at 48+8(%rsp).  Two interleaved carry chains (CF via adcx,
 * OF via adox) run in parallel throughout; instruction order is the
 * algorithm -- do not reorder.
 */
2786 bn_sqrx8x_internal:
2787 __bn_sqrx8x_internal:
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
/* Save length at 0+8(%rsp) and end-of-input pointer at 8+8(%rsp). */
2828         leaq    48+8(%rsp),%rdi
2829         leaq    (%rsi,%r9,1),%rbp
2830         movq    %r9,0+8(%rsp)
2831         movq    %rbp,8+8(%rsp)
2832         jmp     .Lsqr8x_zero_start
2833
2834 .align  32
/* 0x66..0x0f,0x1f,... is a multi-byte NOP used purely as padding. */
2835 .byte   0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
/* Zero the 2n-word result area, 128 bytes per iteration (%xmm0 == 0). */
2836 .Lsqrx8x_zero:
/* 0x3e is a DS segment-override prefix, kept for code-size/alignment. */
2837 .byte   0x3e
2838         movdqa  %xmm0,0(%rdi)
2839         movdqa  %xmm0,16(%rdi)
2840         movdqa  %xmm0,32(%rdi)
2841         movdqa  %xmm0,48(%rdi)
2842 .Lsqr8x_zero_start:
2843         movdqa  %xmm0,64(%rdi)
2844         movdqa  %xmm0,80(%rdi)
2845         movdqa  %xmm0,96(%rdi)
2846         movdqa  %xmm0,112(%rdi)
2847         leaq    128(%rdi),%rdi
2848         subq    $64,%r9
2849         jnz     .Lsqrx8x_zero
2850
2851         movq    0(%rsi),%rdx
2852
/* Clear the 8-wide accumulator bank %r10-%r15 (plus %rbp as constant 0). */
2853         xorq    %r10,%r10
2854         xorq    %r11,%r11
2855         xorq    %r12,%r12
2856         xorq    %r13,%r13
2857         xorq    %r14,%r14
2858         xorq    %r15,%r15
2859         leaq    48+8(%rsp),%rdi
2860         xorq    %rbp,%rbp
2861         jmp     .Lsqrx8x_outer_loop
2862
2863 .align  32
/*
 * Outer loop: off-diagonal products a[i]*a[j] (i<j) for an 8x8 tile.
 * The .byte runs such as c4 e2 ab f6 86 ... are hand-encoded mulxq
 * instructions with memory operands, kept verbatim from the generator
 * for encoding-length/scheduling reasons.
 */
2864 .Lsqrx8x_outer_loop:
2865         mulxq   8(%rsi),%r8,%rax
2866         adcxq   %r9,%r8
2867         adoxq   %rax,%r10
2868         mulxq   16(%rsi),%r9,%rax
2869         adcxq   %r10,%r9
2870         adoxq   %rax,%r11
2871 .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2872         adcxq   %r11,%r10
2873         adoxq   %rax,%r12
2874 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2875         adcxq   %r12,%r11
2876         adoxq   %rax,%r13
2877         mulxq   40(%rsi),%r12,%rax
2878         adcxq   %r13,%r12
2879         adoxq   %rax,%r14
2880         mulxq   48(%rsi),%r13,%rax
2881         adcxq   %r14,%r13
2882         adoxq   %r15,%rax
2883         mulxq   56(%rsi),%r14,%r15
2884         movq    8(%rsi),%rdx
2885         adcxq   %rax,%r14
2886         adoxq   %rbp,%r15
2887         adcq    64(%rdi),%r15
2888         movq    %r8,8(%rdi)
2889         movq    %r9,16(%rdi)
2890         sbbq    %rcx,%rcx
2891         xorq    %rbp,%rbp
2892
2893
2894         mulxq   16(%rsi),%r8,%rbx
2895         mulxq   24(%rsi),%r9,%rax
2896         adcxq   %r10,%r8
2897         adoxq   %rbx,%r9
2898         mulxq   32(%rsi),%r10,%rbx
2899         adcxq   %r11,%r9
2900         adoxq   %rax,%r10
2901 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2902         adcxq   %r12,%r10
2903         adoxq   %rbx,%r11
2904 .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2905         adcxq   %r13,%r11
2906         adoxq   %r14,%r12
2907 .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2908         movq    16(%rsi),%rdx
2909         adcxq   %rax,%r12
2910         adoxq   %rbx,%r13
2911         adcxq   %r15,%r13
2912         adoxq   %rbp,%r14
2913         adcxq   %rbp,%r14
2914
2915         movq    %r8,24(%rdi)
2916         movq    %r9,32(%rdi)
2917
2918         mulxq   24(%rsi),%r8,%rbx
2919         mulxq   32(%rsi),%r9,%rax
2920         adcxq   %r10,%r8
2921         adoxq   %rbx,%r9
2922         mulxq   40(%rsi),%r10,%rbx
2923         adcxq   %r11,%r9
2924         adoxq   %rax,%r10
2925 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2926         adcxq   %r12,%r10
2927         adoxq   %r13,%r11
2928 .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2929 .byte   0x3e
2930         movq    24(%rsi),%rdx
2931         adcxq   %rbx,%r11
2932         adoxq   %rax,%r12
2933         adcxq   %r14,%r12
2934         movq    %r8,40(%rdi)
2935         movq    %r9,48(%rdi)
2936         mulxq   32(%rsi),%r8,%rax
2937         adoxq   %rbp,%r13
2938         adcxq   %rbp,%r13
2939
2940         mulxq   40(%rsi),%r9,%rbx
2941         adcxq   %r10,%r8
2942         adoxq   %rax,%r9
2943         mulxq   48(%rsi),%r10,%rax
2944         adcxq   %r11,%r9
2945         adoxq   %r12,%r10
2946         mulxq   56(%rsi),%r11,%r12
2947         movq    32(%rsi),%rdx
2948         movq    40(%rsi),%r14
2949         adcxq   %rbx,%r10
2950         adoxq   %rax,%r11
2951         movq    48(%rsi),%r15
2952         adcxq   %r13,%r11
2953         adoxq   %rbp,%r12
2954         adcxq   %rbp,%r12
2955
2956         movq    %r8,56(%rdi)
2957         movq    %r9,64(%rdi)
2958
2959         mulxq   %r14,%r9,%rax
2960         movq    56(%rsi),%r8
2961         adcxq   %r10,%r9
2962         mulxq   %r15,%r10,%rbx
2963         adoxq   %rax,%r10
2964         adcxq   %r11,%r10
2965         mulxq   %r8,%r11,%rax
2966         movq    %r14,%rdx
2967         adoxq   %rbx,%r11
2968         adcxq   %r12,%r11
2969
2970         adcxq   %rbp,%rax
2971
2972         mulxq   %r15,%r14,%rbx
2973         mulxq   %r8,%r12,%r13
2974         movq    %r15,%rdx
2975         leaq    64(%rsi),%rsi
2976         adcxq   %r14,%r11
2977         adoxq   %rbx,%r12
2978         adcxq   %rax,%r12
2979         adoxq   %rbp,%r13
2980
2981 .byte   0x67,0x67
2982         mulxq   %r8,%r8,%r14
2983         adcxq   %r8,%r13
2984         adcxq   %rbp,%r14
2985
/* Done when %rsi reaches the end-of-input pointer saved at 8+8(%rsp). */
2986         cmpq    8+8(%rsp),%rsi
2987         je      .Lsqrx8x_outer_break
2988
2989         negq    %rcx
2990         movq    $-8,%rcx
2991         movq    %rbp,%r15
2992         movq    64(%rdi),%r8
2993         adcxq   72(%rdi),%r9
2994         adcxq   80(%rdi),%r10
2995         adcxq   88(%rdi),%r11
2996         adcq    96(%rdi),%r12
2997         adcq    104(%rdi),%r13
2998         adcq    112(%rdi),%r14
2999         adcq    120(%rdi),%r15
3000         leaq    (%rsi),%rbp
3001         leaq    128(%rdi),%rdi
3002         sbbq    %rax,%rax
3003
3004         movq    -64(%rsi),%rdx
3005         movq    %rax,16+8(%rsp)
3006         movq    %rdi,24+8(%rsp)
3007
3008
3009         xorl    %eax,%eax
3010         jmp     .Lsqrx8x_loop
3011
3012 .align  32
/* Inner loop: multiply one a-word (%rdx) by an 8-word slice at (%rbp),
 * accumulating into %r8-%r15; %rcx counts -8..0 words per 64-byte slab. */
3013 .Lsqrx8x_loop:
3014         movq    %r8,%rbx
3015         mulxq   0(%rbp),%rax,%r8
3016         adcxq   %rax,%rbx
3017         adoxq   %r9,%r8
3018
3019         mulxq   8(%rbp),%rax,%r9
3020         adcxq   %rax,%r8
3021         adoxq   %r10,%r9
3022
3023         mulxq   16(%rbp),%rax,%r10
3024         adcxq   %rax,%r9
3025         adoxq   %r11,%r10
3026
3027         mulxq   24(%rbp),%rax,%r11
3028         adcxq   %rax,%r10
3029         adoxq   %r12,%r11
3030
3031 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3032         adcxq   %rax,%r11
3033         adoxq   %r13,%r12
3034
3035         mulxq   40(%rbp),%rax,%r13
3036         adcxq   %rax,%r12
3037         adoxq   %r14,%r13
3038
3039         mulxq   48(%rbp),%rax,%r14
3040         movq    %rbx,(%rdi,%rcx,8)
3041         movl    $0,%ebx
3042         adcxq   %rax,%r13
3043         adoxq   %r15,%r14
3044
3045 .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3046         movq    8(%rsi,%rcx,8),%rdx
3047         adcxq   %rax,%r14
3048         adoxq   %rbx,%r15
3049         adcxq   %rbx,%r15
3050
3051 .byte   0x67
3052         incq    %rcx
3053         jnz     .Lsqrx8x_loop
3054
3055         leaq    64(%rbp),%rbp
3056         movq    $-8,%rcx
3057         cmpq    8+8(%rsp),%rbp
3058         je      .Lsqrx8x_break
3059
/* Re-inject the saved carry (16+8(%rsp) is 0 or -1) and fold in the
 * previously stored partial results before the next slab. */
3060         subq    16+8(%rsp),%rbx
3061 .byte   0x66
3062         movq    -64(%rsi),%rdx
3063         adcxq   0(%rdi),%r8
3064         adcxq   8(%rdi),%r9
3065         adcq    16(%rdi),%r10
3066         adcq    24(%rdi),%r11
3067         adcq    32(%rdi),%r12
3068         adcq    40(%rdi),%r13
3069         adcq    48(%rdi),%r14
3070         adcq    56(%rdi),%r15
3071         leaq    64(%rdi),%rdi
3072 .byte   0x67
3073         sbbq    %rax,%rax
3074         xorl    %ebx,%ebx
3075         movq    %rax,16+8(%rsp)
3076         jmp     .Lsqrx8x_loop
3077
3078 .align  32
3079 .Lsqrx8x_break:
3080         xorq    %rbp,%rbp
3081         subq    16+8(%rsp),%rbx
3082         adcxq   %rbp,%r8
3083         movq    24+8(%rsp),%rcx
3084         adcxq   %rbp,%r9
3085         movq    0(%rsi),%rdx
3086         adcq    $0,%r10
3087         movq    %r8,0(%rdi)
3088         adcq    $0,%r11
3089         adcq    $0,%r12
3090         adcq    $0,%r13
3091         adcq    $0,%r14
3092         adcq    $0,%r15
3093         cmpq    %rcx,%rdi
3094         je      .Lsqrx8x_outer_loop
3095
3096         movq    %r9,8(%rdi)
3097         movq    8(%rcx),%r9
3098         movq    %r10,16(%rdi)
3099         movq    16(%rcx),%r10
3100         movq    %r11,24(%rdi)
3101         movq    24(%rcx),%r11
3102         movq    %r12,32(%rdi)
3103         movq    32(%rcx),%r12
3104         movq    %r13,40(%rdi)
3105         movq    40(%rcx),%r13
3106         movq    %r14,48(%rdi)
3107         movq    48(%rcx),%r14
3108         movq    %r15,56(%rdi)
3109         movq    56(%rcx),%r15
3110         movq    %rcx,%rdi
3111         jmp     .Lsqrx8x_outer_loop
3112
3113 .align  32
/* All off-diagonal tiles done: flush accumulators to memory.
 * 66 48 0f 7e d9 = movq %xmm3,%rcx (recover stashed negative length). */
3114 .Lsqrx8x_outer_break:
3115         movq    %r9,72(%rdi)
3116 .byte   102,72,15,126,217
3117         movq    %r10,80(%rdi)
3118         movq    %r11,88(%rdi)
3119         movq    %r12,96(%rdi)
3120         movq    %r13,104(%rdi)
3121         movq    %r14,112(%rdi)
3122         leaq    48+8(%rsp),%rdi
3123         movq    (%rsi,%rcx,1),%rdx
3124
3125         movq    8(%rdi),%r11
3126         xorq    %r10,%r10
3127         movq    0+8(%rsp),%r9
3128         adoxq   %r11,%r11
3129         movq    16(%rdi),%r12
3130         movq    24(%rdi),%r13
3131
3132
3133 .align  32
/*
 * Shift-and-add pass: double every off-diagonal word (adoxq rX,rX) and
 * add the diagonal squares a[i]^2 (mulxq %rdx,... with %rdx = a[i]),
 * four result pairs per iteration.  The .byte runs are hand-encoded
 * movq loads kept verbatim from the generator.
 */
3134 .Lsqrx4x_shift_n_add:
3135         mulxq   %rdx,%rax,%rbx
3136         adoxq   %r12,%r12
3137         adcxq   %r10,%rax
3138 .byte   0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3139 .byte   0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3140         adoxq   %r13,%r13
3141         adcxq   %r11,%rbx
3142         movq    40(%rdi),%r11
3143         movq    %rax,0(%rdi)
3144         movq    %rbx,8(%rdi)
3145
3146         mulxq   %rdx,%rax,%rbx
3147         adoxq   %r10,%r10
3148         adcxq   %r12,%rax
3149         movq    16(%rsi,%rcx,1),%rdx
3150         movq    48(%rdi),%r12
3151         adoxq   %r11,%r11
3152         adcxq   %r13,%rbx
3153         movq    56(%rdi),%r13
3154         movq    %rax,16(%rdi)
3155         movq    %rbx,24(%rdi)
3156
3157         mulxq   %rdx,%rax,%rbx
3158         adoxq   %r12,%r12
3159         adcxq   %r10,%rax
3160         movq    24(%rsi,%rcx,1),%rdx
3161         leaq    32(%rcx),%rcx
3162         movq    64(%rdi),%r10
3163         adoxq   %r13,%r13
3164         adcxq   %r11,%rbx
3165         movq    72(%rdi),%r11
3166         movq    %rax,32(%rdi)
3167         movq    %rbx,40(%rdi)
3168
3169         mulxq   %rdx,%rax,%rbx
3170         adoxq   %r10,%r10
3171         adcxq   %r12,%rax
/* jrcxz exits when the word counter %rcx wraps to zero. */
3172         jrcxz   .Lsqrx4x_shift_n_add_break
3173 .byte   0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3174         adoxq   %r11,%r11
3175         adcxq   %r13,%rbx
3176         movq    80(%rdi),%r12
3177         movq    88(%rdi),%r13
3178         movq    %rax,48(%rdi)
3179         movq    %rbx,56(%rdi)
3180         leaq    64(%rdi),%rdi
3181         nop
3182         jmp     .Lsqrx4x_shift_n_add
3183
3184 .align  32
3185 .Lsqrx4x_shift_n_add_break:
3186         adcxq   %r13,%rbx
3187         movq    %rax,48(%rdi)
3188         movq    %rbx,56(%rdi)
3189         leaq    64(%rdi),%rdi
/* 66 48 0f 7e d5 = movq %xmm2,%rbp (recover stashed modulus pointer). */
3190 .byte   102,72,15,126,213
/*
 * Montgomery reduction of the 2n-word square.  %rbx = n0 (loaded from
 * 32+8(%rsp), where the caller stored it); %rbp walks the modulus;
 * reduction proceeds 8 words at a time (.Lsqrx8x_reduce), with tail
 * propagation in .Lsqrx8x_tail.
 */
3191 __bn_sqrx8x_reduction:
3192         xorl    %eax,%eax
3193         movq    32+8(%rsp),%rbx
3194         movq    48+8(%rsp),%rdx
3195         leaq    -64(%rbp,%r9,1),%rcx
3196
3197         movq    %rcx,0+8(%rsp)
3198         movq    %rdi,8+8(%rsp)
3199
3200         leaq    48+8(%rsp),%rdi
3201         jmp     .Lsqrx8x_reduction_loop
3202
3203 .align  32
3204 .Lsqrx8x_reduction_loop:
3205         movq    8(%rdi),%r9
3206         movq    16(%rdi),%r10
3207         movq    24(%rdi),%r11
3208         movq    32(%rdi),%r12
3209         movq    %rdx,%r8
/* %rdx = t[0] * n0 mod 2^64 -- the Montgomery quotient digit. */
3210         imulq   %rbx,%rdx
3211         movq    40(%rdi),%r13
3212         movq    48(%rdi),%r14
3213         movq    56(%rdi),%r15
3214         movq    %rax,24+8(%rsp)
3215
3216         leaq    64(%rdi),%rdi
3217         xorq    %rsi,%rsi
3218         movq    $-8,%rcx
3219         jmp     .Lsqrx8x_reduce
3220
3221 .align  32
3222 .Lsqrx8x_reduce:
3223         movq    %r8,%rbx
3224         mulxq   0(%rbp),%rax,%r8
3225         adcxq   %rbx,%rax
3226         adoxq   %r9,%r8
3227
3228         mulxq   8(%rbp),%rbx,%r9
3229         adcxq   %rbx,%r8
3230         adoxq   %r10,%r9
3231
3232         mulxq   16(%rbp),%rbx,%r10
3233         adcxq   %rbx,%r9
3234         adoxq   %r11,%r10
3235
3236         mulxq   24(%rbp),%rbx,%r11
3237         adcxq   %rbx,%r10
3238         adoxq   %r12,%r11
3239
3240 .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3241         movq    %rdx,%rax
3242         movq    %r8,%rdx
3243         adcxq   %rbx,%r11
3244         adoxq   %r13,%r12
3245
/* Compute the next quotient digit from the fresh t[0] while the current
 * chain is still in flight; stash this round's digit in the frame. */
3246         mulxq   32+8(%rsp),%rbx,%rdx
3247         movq    %rax,%rdx
3248         movq    %rax,64+48+8(%rsp,%rcx,8)
3249
3250         mulxq   40(%rbp),%rax,%r13
3251         adcxq   %rax,%r12
3252         adoxq   %r14,%r13
3253
3254         mulxq   48(%rbp),%rax,%r14
3255         adcxq   %rax,%r13
3256         adoxq   %r15,%r14
3257
3258         mulxq   56(%rbp),%rax,%r15
3259         movq    %rbx,%rdx
3260         adcxq   %rax,%r14
3261         adoxq   %rsi,%r15
3262         adcxq   %rsi,%r15
3263
3264 .byte   0x67,0x67,0x67
3265         incq    %rcx
3266         jnz     .Lsqrx8x_reduce
3267
3268         movq    %rsi,%rax
3269         cmpq    0+8(%rsp),%rbp
3270         jae     .Lsqrx8x_no_tail
3271
3272         movq    48+8(%rsp),%rdx
3273         addq    0(%rdi),%r8
3274         leaq    64(%rbp),%rbp
3275         movq    $-8,%rcx
3276         adcxq   8(%rdi),%r9
3277         adcxq   16(%rdi),%r10
3278         adcq    24(%rdi),%r11
3279         adcq    32(%rdi),%r12
3280         adcq    40(%rdi),%r13
3281         adcq    48(%rdi),%r14
3282         adcq    56(%rdi),%r15
3283         leaq    64(%rdi),%rdi
3284         sbbq    %rax,%rax
3285
3286         xorq    %rsi,%rsi
3287         movq    %rax,16+8(%rsp)
3288         jmp     .Lsqrx8x_tail
3289
3290 .align  32
/* Tail: propagate the stored quotient digits across the rest of the
 * modulus; digits are reloaded from 72+48+8(%rsp,%rcx,8). */
3291 .Lsqrx8x_tail:
3292         movq    %r8,%rbx
3293         mulxq   0(%rbp),%rax,%r8
3294         adcxq   %rax,%rbx
3295         adoxq   %r9,%r8
3296
3297         mulxq   8(%rbp),%rax,%r9
3298         adcxq   %rax,%r8
3299         adoxq   %r10,%r9
3300
3301         mulxq   16(%rbp),%rax,%r10
3302         adcxq   %rax,%r9
3303         adoxq   %r11,%r10
3304
3305         mulxq   24(%rbp),%rax,%r11
3306         adcxq   %rax,%r10
3307         adoxq   %r12,%r11
3308
3309 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3310         adcxq   %rax,%r11
3311         adoxq   %r13,%r12
3312
3313         mulxq   40(%rbp),%rax,%r13
3314         adcxq   %rax,%r12
3315         adoxq   %r14,%r13
3316
3317         mulxq   48(%rbp),%rax,%r14
3318         adcxq   %rax,%r13
3319         adoxq   %r15,%r14
3320
3321         mulxq   56(%rbp),%rax,%r15
3322         movq    72+48+8(%rsp,%rcx,8),%rdx
3323         adcxq   %rax,%r14
3324         adoxq   %rsi,%r15
3325         movq    %rbx,(%rdi,%rcx,8)
3326         movq    %r8,%rbx
3327         adcxq   %rsi,%r15
3328
3329         incq    %rcx
3330         jnz     .Lsqrx8x_tail
3331
3332         cmpq    0+8(%rsp),%rbp
3333         jae     .Lsqrx8x_tail_done
3334
3335         subq    16+8(%rsp),%rsi
3336         movq    48+8(%rsp),%rdx
3337         leaq    64(%rbp),%rbp
3338         adcq    0(%rdi),%r8
3339         adcq    8(%rdi),%r9
3340         adcq    16(%rdi),%r10
3341         adcq    24(%rdi),%r11
3342         adcq    32(%rdi),%r12
3343         adcq    40(%rdi),%r13
3344         adcq    48(%rdi),%r14
3345         adcq    56(%rdi),%r15
3346         leaq    64(%rdi),%rdi
3347         sbbq    %rax,%rax
3348         subq    $8,%rcx
3349
3350         xorq    %rsi,%rsi
3351         movq    %rax,16+8(%rsp)
3352         jmp     .Lsqrx8x_tail
3353
3354 .align  32
3355 .Lsqrx8x_tail_done:
3356         xorq    %rax,%rax
3357         addq    24+8(%rsp),%r8
3358         adcq    $0,%r9
3359         adcq    $0,%r10
3360         adcq    $0,%r11
3361         adcq    $0,%r12
3362         adcq    $0,%r13
3363         adcq    $0,%r14
3364         adcq    $0,%r15
3365         adcq    $0,%rax
3366
3367         subq    16+8(%rsp),%rsi
/* Fold the top half into the result; 66 48 0f 7e d9 = movq %xmm3,%rcx,
 * 66 48 0f 7e d5 = movq %xmm2,%rbp (restore stashed length/modulus). */
3368 .Lsqrx8x_no_tail:
3369         adcq    0(%rdi),%r8
3370 .byte   102,72,15,126,217
3371         adcq    8(%rdi),%r9
3372         movq    56(%rbp),%rsi
3373 .byte   102,72,15,126,213
3374         adcq    16(%rdi),%r10
3375         adcq    24(%rdi),%r11
3376         adcq    32(%rdi),%r12
3377         adcq    40(%rdi),%r13
3378         adcq    48(%rdi),%r14
3379         adcq    56(%rdi),%r15
3380         adcq    $0,%rax
3381
3382         movq    32+8(%rsp),%rbx
3383         movq    64(%rdi,%rcx,1),%rdx
3384
3385         movq    %r8,0(%rdi)
3386         leaq    64(%rdi),%r8
3387         movq    %r9,8(%rdi)
3388         movq    %r10,16(%rdi)
3389         movq    %r11,24(%rdi)
3390         movq    %r12,32(%rdi)
3391         movq    %r13,40(%rdi)
3392         movq    %r14,48(%rdi)
3393         movq    %r15,56(%rdi)
3394
3395         leaq    64(%rdi,%rcx,1),%rdi
3396         cmpq    8+8(%rsp),%r8
3397         jb      .Lsqrx8x_reduction_loop
3398         .byte   0xf3,0xc3
3399 .size   bn_sqrx8x_internal,.-bn_sqrx8x_internal
3400 .align  32
/*
 * __bn_postx4x_internal -- branch-free conditional final subtraction for
 * the MULX path: computes result - (mask ? modulus : 0), 4 words per
 * iteration, where the mask in %rax (all-ones or all-zero after negq) was
 * produced by the preceding reduction's top-word borrow.  andnq applies
 * the mask to the modulus words without a data-dependent branch
 * (timing-attack hygiene of the original perlasm).
 * In: %rbp = modulus, %rcx = negative byte length, %rax = borrow.
 * 66 48 0f 7e ca = movq %xmm1,%rdx; 66 48 0f 7e ce = movq %xmm1,%rsi --
 * both recover the stashed output pointer.
 */
3401 __bn_postx4x_internal:
3402         movq    0(%rbp),%r12
3403         movq    %rcx,%r10
3404         movq    %rcx,%r9
3405         negq    %rax
/* Convert byte count to 4-word group count: %rcx >>= 5 (arithmetic). */
3406         sarq    $3+2,%rcx
3407
3408 .byte   102,72,15,126,202
3409 .byte   102,72,15,126,206
3410         decq    %r12
3411         movq    8(%rbp),%r13
3412         xorq    %r8,%r8
3413         movq    16(%rbp),%r14
3414         movq    24(%rbp),%r15
3415         jmp     .Lsqrx4x_sub_entry
3416
3417 .align  16
3418 .Lsqrx4x_sub:
3419         movq    0(%rbp),%r12
3420         movq    8(%rbp),%r13
3421         movq    16(%rbp),%r14
3422         movq    24(%rbp),%r15
3423 .Lsqrx4x_sub_entry:
/* rX = ~rX & mask: masked, complemented modulus words feed the adc chain
 * below so the whole thing is (t + ~n + 1) = t - n when mask is set. */
3424         andnq   %rax,%r12,%r12
3425         leaq    32(%rbp),%rbp
3426         andnq   %rax,%r13,%r13
3427         andnq   %rax,%r14,%r14
3428         andnq   %rax,%r15,%r15
3429
/* negq %r8 reloads CF with the borrow carried in %r8 across iterations. */
3430         negq    %r8
3431         adcq    0(%rdi),%r12
3432         adcq    8(%rdi),%r13
3433         adcq    16(%rdi),%r14
3434         adcq    24(%rdi),%r15
3435         movq    %r12,0(%rdx)
3436         leaq    32(%rdi),%rdi
3437         movq    %r13,8(%rdx)
3438         sbbq    %r8,%r8
3439         movq    %r14,16(%rdx)
3440         movq    %r15,24(%rdx)
3441         leaq    32(%rdx),%rdx
3442
3443         incq    %rcx
3444         jnz     .Lsqrx4x_sub
3445
3446         negq    %r9
3447
3448         .byte   0xf3,0xc3
3449 .size   __bn_postx4x_internal,.-__bn_postx4x_internal
3450 .globl  bn_get_bits5
3451 .type   bn_get_bits5,@function
3452 .align  16
3453 bn_get_bits5:
3454         leaq    0(%rdi),%r10
3455         leaq    1(%rdi),%r11
3456         movl    %esi,%ecx
3457         shrl    $4,%esi
3458         andl    $15,%ecx
3459         leal    -8(%rcx),%eax
3460         cmpl    $11,%ecx
3461         cmovaq  %r11,%r10
3462         cmoval  %eax,%ecx
3463         movzwl  (%r10,%rsi,2),%eax
3464         shrl    %cl,%eax
3465         andl    $31,%eax
3466         .byte   0xf3,0xc3
3467 .size   bn_get_bits5,.-bn_get_bits5
3468
3469 .globl  bn_scatter5
3470 .type   bn_scatter5,@function
3471 .align  16
3472 bn_scatter5:
3473         cmpl    $0,%esi
3474         jz      .Lscatter_epilogue
3475         leaq    (%rdx,%rcx,8),%rdx
3476 .Lscatter:
3477         movq    (%rdi),%rax
3478         leaq    8(%rdi),%rdi
3479         movq    %rax,(%rdx)
3480         leaq    256(%rdx),%rdx
3481         subl    $1,%esi
3482         jnz     .Lscatter
3483 .Lscatter_epilogue:
3484         .byte   0xf3,0xc3
3485 .size   bn_scatter5,.-bn_scatter5
3486
3487 .globl  bn_gather5
3488 .type   bn_gather5,@function
3489 .align  32
3490 bn_gather5:
3491 .LSEH_begin_bn_gather5:
3492
3493 .byte   0x4c,0x8d,0x14,0x24
3494 .byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00
3495         leaq    .Linc(%rip),%rax
3496         andq    $-16,%rsp
3497
3498         movd    %ecx,%xmm5
3499         movdqa  0(%rax),%xmm0
3500         movdqa  16(%rax),%xmm1
3501         leaq    128(%rdx),%r11
3502         leaq    128(%rsp),%rax
3503
3504         pshufd  $0,%xmm5,%xmm5
3505         movdqa  %xmm1,%xmm4
3506         movdqa  %xmm1,%xmm2
3507         paddd   %xmm0,%xmm1
3508         pcmpeqd %xmm5,%xmm0
3509         movdqa  %xmm4,%xmm3
3510
3511         paddd   %xmm1,%xmm2
3512         pcmpeqd %xmm5,%xmm1
3513         movdqa  %xmm0,-128(%rax)
3514         movdqa  %xmm4,%xmm0
3515
3516         paddd   %xmm2,%xmm3
3517         pcmpeqd %xmm5,%xmm2
3518         movdqa  %xmm1,-112(%rax)
3519         movdqa  %xmm4,%xmm1
3520
3521         paddd   %xmm3,%xmm0
3522         pcmpeqd %xmm5,%xmm3
3523         movdqa  %xmm2,-96(%rax)
3524         movdqa  %xmm4,%xmm2
3525         paddd   %xmm0,%xmm1
3526         pcmpeqd %xmm5,%xmm0
3527         movdqa  %xmm3,-80(%rax)
3528         movdqa  %xmm4,%xmm3
3529
3530         paddd   %xmm1,%xmm2
3531         pcmpeqd %xmm5,%xmm1
3532         movdqa  %xmm0,-64(%rax)
3533         movdqa  %xmm4,%xmm0
3534
3535         paddd   %xmm2,%xmm3
3536         pcmpeqd %xmm5,%xmm2
3537         movdqa  %xmm1,-48(%rax)
3538         movdqa  %xmm4,%xmm1
3539
3540         paddd   %xmm3,%xmm0
3541         pcmpeqd %xmm5,%xmm3
3542         movdqa  %xmm2,-32(%rax)
3543         movdqa  %xmm4,%xmm2
3544         paddd   %xmm0,%xmm1
3545         pcmpeqd %xmm5,%xmm0
3546         movdqa  %xmm3,-16(%rax)
3547         movdqa  %xmm4,%xmm3
3548
3549         paddd   %xmm1,%xmm2
3550         pcmpeqd %xmm5,%xmm1
3551         movdqa  %xmm0,0(%rax)
3552         movdqa  %xmm4,%xmm0
3553
3554         paddd   %xmm2,%xmm3
3555         pcmpeqd %xmm5,%xmm2
3556         movdqa  %xmm1,16(%rax)
3557         movdqa  %xmm4,%xmm1
3558
3559         paddd   %xmm3,%xmm0
3560         pcmpeqd %xmm5,%xmm3
3561         movdqa  %xmm2,32(%rax)
3562         movdqa  %xmm4,%xmm2
3563         paddd   %xmm0,%xmm1
3564         pcmpeqd %xmm5,%xmm0
3565         movdqa  %xmm3,48(%rax)
3566         movdqa  %xmm4,%xmm3
3567
3568         paddd   %xmm1,%xmm2
3569         pcmpeqd %xmm5,%xmm1
3570         movdqa  %xmm0,64(%rax)
3571         movdqa  %xmm4,%xmm0
3572
3573         paddd   %xmm2,%xmm3
3574         pcmpeqd %xmm5,%xmm2
3575         movdqa  %xmm1,80(%rax)
3576         movdqa  %xmm4,%xmm1
3577
3578         paddd   %xmm3,%xmm0
3579         pcmpeqd %xmm5,%xmm3
3580         movdqa  %xmm2,96(%rax)
3581         movdqa  %xmm4,%xmm2
3582         movdqa  %xmm3,112(%rax)
3583         jmp     .Lgather
3584
3585 .align  32
3586 .Lgather:
3587         pxor    %xmm4,%xmm4
3588         pxor    %xmm5,%xmm5
3589         movdqa  -128(%r11),%xmm0
3590         movdqa  -112(%r11),%xmm1
3591         movdqa  -96(%r11),%xmm2
3592         pand    -128(%rax),%xmm0
3593         movdqa  -80(%r11),%xmm3
3594         pand    -112(%rax),%xmm1
3595         por     %xmm0,%xmm4
3596         pand    -96(%rax),%xmm2
3597         por     %xmm1,%xmm5
3598         pand    -80(%rax),%xmm3
3599         por     %xmm2,%xmm4
3600         por     %xmm3,%xmm5
3601         movdqa  -64(%r11),%xmm0
3602         movdqa  -48(%r11),%xmm1
3603         movdqa  -32(%r11),%xmm2
3604         pand    -64(%rax),%xmm0
3605         movdqa  -16(%r11),%xmm3
3606         pand    -48(%rax),%xmm1
3607         por     %xmm0,%xmm4
3608         pand    -32(%rax),%xmm2
3609         por     %xmm1,%xmm5
3610         pand    -16(%rax),%xmm3
3611         por     %xmm2,%xmm4
3612         por     %xmm3,%xmm5
3613         movdqa  0(%r11),%xmm0
3614         movdqa  16(%r11),%xmm1
3615         movdqa  32(%r11),%xmm2
3616         pand    0(%rax),%xmm0
3617         movdqa  48(%r11),%xmm3
3618         pand    16(%rax),%xmm1
3619         por     %xmm0,%xmm4
3620         pand    32(%rax),%xmm2
3621         por     %xmm1,%xmm5
3622         pand    48(%rax),%xmm3
3623         por     %xmm2,%xmm4
3624         por     %xmm3,%xmm5
3625         movdqa  64(%r11),%xmm0
3626         movdqa  80(%r11),%xmm1
3627         movdqa  96(%r11),%xmm2
3628         pand    64(%rax),%xmm0
3629         movdqa  112(%r11),%xmm3
3630         pand    80(%rax),%xmm1
3631         por     %xmm0,%xmm4
3632         pand    96(%rax),%xmm2
3633         por     %xmm1,%xmm5
3634         pand    112(%rax),%xmm3
3635         por     %xmm2,%xmm4
3636         por     %xmm3,%xmm5
3637         por     %xmm5,%xmm4
3638         leaq    256(%r11),%r11
3639         pshufd  $0x4e,%xmm4,%xmm0
3640         por     %xmm4,%xmm0
3641         movq    %xmm0,(%rdi)
3642         leaq    8(%rdi),%rdi
3643         subl    $1,%esi
3644         jnz     .Lgather
3645
3646         leaq    (%r10),%rsp
3647         .byte   0xf3,0xc3
3648 .LSEH_end_bn_gather5:
3649 .size   bn_gather5,.-bn_gather5
3650 .align  64
3651 .Linc:
3652 .long   0,0, 1,1
3653 .long   2,2, 2,2
3654 .byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0