/*
 * Source: FreeBSD secure/lib/libcrypto/amd64/x86_64-mont5.S (MFC: r337791)
 */
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
.text


7 .globl  bn_mul_mont_gather5
8 .type   bn_mul_mont_gather5,@function
9 .align  64
10 bn_mul_mont_gather5:
11         movl    %r9d,%r9d
12         movq    %rsp,%rax
13         testl   $7,%r9d
14         jnz     .Lmul_enter
15         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
16         jmp     .Lmul4x_enter
17
18 .align  16
19 .Lmul_enter:
20         movd    8(%rsp),%xmm5
21         pushq   %rbx
22         pushq   %rbp
23         pushq   %r12
24         pushq   %r13
25         pushq   %r14
26         pushq   %r15
27
28         negq    %r9
29         movq    %rsp,%r11
30         leaq    -280(%rsp,%r9,8),%r10
31         negq    %r9
32         andq    $-1024,%r10
33
34
35
36
37
38
39
40         subq    %r10,%r11
41         andq    $-4096,%r11
42         leaq    (%r10,%r11,1),%rsp
43         movq    (%rsp),%r11
44         cmpq    %r10,%rsp
45         ja      .Lmul_page_walk
46         jmp     .Lmul_page_walk_done
47
48 .Lmul_page_walk:
49         leaq    -4096(%rsp),%rsp
50         movq    (%rsp),%r11
51         cmpq    %r10,%rsp
52         ja      .Lmul_page_walk
53 .Lmul_page_walk_done:
54
55         leaq    .Linc(%rip),%r10
56         movq    %rax,8(%rsp,%r9,8)
57 .Lmul_body:
58
59         leaq    128(%rdx),%r12
60         movdqa  0(%r10),%xmm0
61         movdqa  16(%r10),%xmm1
62         leaq    24-112(%rsp,%r9,8),%r10
63         andq    $-16,%r10
64
65         pshufd  $0,%xmm5,%xmm5
66         movdqa  %xmm1,%xmm4
67         movdqa  %xmm1,%xmm2
68         paddd   %xmm0,%xmm1
69         pcmpeqd %xmm5,%xmm0
70 .byte   0x67
71         movdqa  %xmm4,%xmm3
72         paddd   %xmm1,%xmm2
73         pcmpeqd %xmm5,%xmm1
74         movdqa  %xmm0,112(%r10)
75         movdqa  %xmm4,%xmm0
76
77         paddd   %xmm2,%xmm3
78         pcmpeqd %xmm5,%xmm2
79         movdqa  %xmm1,128(%r10)
80         movdqa  %xmm4,%xmm1
81
82         paddd   %xmm3,%xmm0
83         pcmpeqd %xmm5,%xmm3
84         movdqa  %xmm2,144(%r10)
85         movdqa  %xmm4,%xmm2
86
87         paddd   %xmm0,%xmm1
88         pcmpeqd %xmm5,%xmm0
89         movdqa  %xmm3,160(%r10)
90         movdqa  %xmm4,%xmm3
91         paddd   %xmm1,%xmm2
92         pcmpeqd %xmm5,%xmm1
93         movdqa  %xmm0,176(%r10)
94         movdqa  %xmm4,%xmm0
95
96         paddd   %xmm2,%xmm3
97         pcmpeqd %xmm5,%xmm2
98         movdqa  %xmm1,192(%r10)
99         movdqa  %xmm4,%xmm1
100
101         paddd   %xmm3,%xmm0
102         pcmpeqd %xmm5,%xmm3
103         movdqa  %xmm2,208(%r10)
104         movdqa  %xmm4,%xmm2
105
106         paddd   %xmm0,%xmm1
107         pcmpeqd %xmm5,%xmm0
108         movdqa  %xmm3,224(%r10)
109         movdqa  %xmm4,%xmm3
110         paddd   %xmm1,%xmm2
111         pcmpeqd %xmm5,%xmm1
112         movdqa  %xmm0,240(%r10)
113         movdqa  %xmm4,%xmm0
114
115         paddd   %xmm2,%xmm3
116         pcmpeqd %xmm5,%xmm2
117         movdqa  %xmm1,256(%r10)
118         movdqa  %xmm4,%xmm1
119
120         paddd   %xmm3,%xmm0
121         pcmpeqd %xmm5,%xmm3
122         movdqa  %xmm2,272(%r10)
123         movdqa  %xmm4,%xmm2
124
125         paddd   %xmm0,%xmm1
126         pcmpeqd %xmm5,%xmm0
127         movdqa  %xmm3,288(%r10)
128         movdqa  %xmm4,%xmm3
129         paddd   %xmm1,%xmm2
130         pcmpeqd %xmm5,%xmm1
131         movdqa  %xmm0,304(%r10)
132
133         paddd   %xmm2,%xmm3
134 .byte   0x67
135         pcmpeqd %xmm5,%xmm2
136         movdqa  %xmm1,320(%r10)
137
138         pcmpeqd %xmm5,%xmm3
139         movdqa  %xmm2,336(%r10)
140         pand    64(%r12),%xmm0
141
142         pand    80(%r12),%xmm1
143         pand    96(%r12),%xmm2
144         movdqa  %xmm3,352(%r10)
145         pand    112(%r12),%xmm3
146         por     %xmm2,%xmm0
147         por     %xmm3,%xmm1
148         movdqa  -128(%r12),%xmm4
149         movdqa  -112(%r12),%xmm5
150         movdqa  -96(%r12),%xmm2
151         pand    112(%r10),%xmm4
152         movdqa  -80(%r12),%xmm3
153         pand    128(%r10),%xmm5
154         por     %xmm4,%xmm0
155         pand    144(%r10),%xmm2
156         por     %xmm5,%xmm1
157         pand    160(%r10),%xmm3
158         por     %xmm2,%xmm0
159         por     %xmm3,%xmm1
160         movdqa  -64(%r12),%xmm4
161         movdqa  -48(%r12),%xmm5
162         movdqa  -32(%r12),%xmm2
163         pand    176(%r10),%xmm4
164         movdqa  -16(%r12),%xmm3
165         pand    192(%r10),%xmm5
166         por     %xmm4,%xmm0
167         pand    208(%r10),%xmm2
168         por     %xmm5,%xmm1
169         pand    224(%r10),%xmm3
170         por     %xmm2,%xmm0
171         por     %xmm3,%xmm1
172         movdqa  0(%r12),%xmm4
173         movdqa  16(%r12),%xmm5
174         movdqa  32(%r12),%xmm2
175         pand    240(%r10),%xmm4
176         movdqa  48(%r12),%xmm3
177         pand    256(%r10),%xmm5
178         por     %xmm4,%xmm0
179         pand    272(%r10),%xmm2
180         por     %xmm5,%xmm1
181         pand    288(%r10),%xmm3
182         por     %xmm2,%xmm0
183         por     %xmm3,%xmm1
184         por     %xmm1,%xmm0
185         pshufd  $0x4e,%xmm0,%xmm1
186         por     %xmm1,%xmm0
187         leaq    256(%r12),%r12
188 .byte   102,72,15,126,195
189
190         movq    (%r8),%r8
191         movq    (%rsi),%rax
192
193         xorq    %r14,%r14
194         xorq    %r15,%r15
195
196         movq    %r8,%rbp
197         mulq    %rbx
198         movq    %rax,%r10
199         movq    (%rcx),%rax
200
201         imulq   %r10,%rbp
202         movq    %rdx,%r11
203
204         mulq    %rbp
205         addq    %rax,%r10
206         movq    8(%rsi),%rax
207         adcq    $0,%rdx
208         movq    %rdx,%r13
209
210         leaq    1(%r15),%r15
211         jmp     .L1st_enter
212
213 .align  16
214 .L1st:
215         addq    %rax,%r13
216         movq    (%rsi,%r15,8),%rax
217         adcq    $0,%rdx
218         addq    %r11,%r13
219         movq    %r10,%r11
220         adcq    $0,%rdx
221         movq    %r13,-16(%rsp,%r15,8)
222         movq    %rdx,%r13
223
224 .L1st_enter:
225         mulq    %rbx
226         addq    %rax,%r11
227         movq    (%rcx,%r15,8),%rax
228         adcq    $0,%rdx
229         leaq    1(%r15),%r15
230         movq    %rdx,%r10
231
232         mulq    %rbp
233         cmpq    %r9,%r15
234         jne     .L1st
235
236
237         addq    %rax,%r13
238         adcq    $0,%rdx
239         addq    %r11,%r13
240         adcq    $0,%rdx
241         movq    %r13,-16(%rsp,%r9,8)
242         movq    %rdx,%r13
243         movq    %r10,%r11
244
245         xorq    %rdx,%rdx
246         addq    %r11,%r13
247         adcq    $0,%rdx
248         movq    %r13,-8(%rsp,%r9,8)
249         movq    %rdx,(%rsp,%r9,8)
250
251         leaq    1(%r14),%r14
252         jmp     .Louter
253 .align  16
254 .Louter:
255         leaq    24+128(%rsp,%r9,8),%rdx
256         andq    $-16,%rdx
257         pxor    %xmm4,%xmm4
258         pxor    %xmm5,%xmm5
259         movdqa  -128(%r12),%xmm0
260         movdqa  -112(%r12),%xmm1
261         movdqa  -96(%r12),%xmm2
262         movdqa  -80(%r12),%xmm3
263         pand    -128(%rdx),%xmm0
264         pand    -112(%rdx),%xmm1
265         por     %xmm0,%xmm4
266         pand    -96(%rdx),%xmm2
267         por     %xmm1,%xmm5
268         pand    -80(%rdx),%xmm3
269         por     %xmm2,%xmm4
270         por     %xmm3,%xmm5
271         movdqa  -64(%r12),%xmm0
272         movdqa  -48(%r12),%xmm1
273         movdqa  -32(%r12),%xmm2
274         movdqa  -16(%r12),%xmm3
275         pand    -64(%rdx),%xmm0
276         pand    -48(%rdx),%xmm1
277         por     %xmm0,%xmm4
278         pand    -32(%rdx),%xmm2
279         por     %xmm1,%xmm5
280         pand    -16(%rdx),%xmm3
281         por     %xmm2,%xmm4
282         por     %xmm3,%xmm5
283         movdqa  0(%r12),%xmm0
284         movdqa  16(%r12),%xmm1
285         movdqa  32(%r12),%xmm2
286         movdqa  48(%r12),%xmm3
287         pand    0(%rdx),%xmm0
288         pand    16(%rdx),%xmm1
289         por     %xmm0,%xmm4
290         pand    32(%rdx),%xmm2
291         por     %xmm1,%xmm5
292         pand    48(%rdx),%xmm3
293         por     %xmm2,%xmm4
294         por     %xmm3,%xmm5
295         movdqa  64(%r12),%xmm0
296         movdqa  80(%r12),%xmm1
297         movdqa  96(%r12),%xmm2
298         movdqa  112(%r12),%xmm3
299         pand    64(%rdx),%xmm0
300         pand    80(%rdx),%xmm1
301         por     %xmm0,%xmm4
302         pand    96(%rdx),%xmm2
303         por     %xmm1,%xmm5
304         pand    112(%rdx),%xmm3
305         por     %xmm2,%xmm4
306         por     %xmm3,%xmm5
307         por     %xmm5,%xmm4
308         pshufd  $0x4e,%xmm4,%xmm0
309         por     %xmm4,%xmm0
310         leaq    256(%r12),%r12
311
312         movq    (%rsi),%rax
313 .byte   102,72,15,126,195
314
315         xorq    %r15,%r15
316         movq    %r8,%rbp
317         movq    (%rsp),%r10
318
319         mulq    %rbx
320         addq    %rax,%r10
321         movq    (%rcx),%rax
322         adcq    $0,%rdx
323
324         imulq   %r10,%rbp
325         movq    %rdx,%r11
326
327         mulq    %rbp
328         addq    %rax,%r10
329         movq    8(%rsi),%rax
330         adcq    $0,%rdx
331         movq    8(%rsp),%r10
332         movq    %rdx,%r13
333
334         leaq    1(%r15),%r15
335         jmp     .Linner_enter
336
337 .align  16
338 .Linner:
339         addq    %rax,%r13
340         movq    (%rsi,%r15,8),%rax
341         adcq    $0,%rdx
342         addq    %r10,%r13
343         movq    (%rsp,%r15,8),%r10
344         adcq    $0,%rdx
345         movq    %r13,-16(%rsp,%r15,8)
346         movq    %rdx,%r13
347
348 .Linner_enter:
349         mulq    %rbx
350         addq    %rax,%r11
351         movq    (%rcx,%r15,8),%rax
352         adcq    $0,%rdx
353         addq    %r11,%r10
354         movq    %rdx,%r11
355         adcq    $0,%r11
356         leaq    1(%r15),%r15
357
358         mulq    %rbp
359         cmpq    %r9,%r15
360         jne     .Linner
361
362         addq    %rax,%r13
363         adcq    $0,%rdx
364         addq    %r10,%r13
365         movq    (%rsp,%r9,8),%r10
366         adcq    $0,%rdx
367         movq    %r13,-16(%rsp,%r9,8)
368         movq    %rdx,%r13
369
370         xorq    %rdx,%rdx
371         addq    %r11,%r13
372         adcq    $0,%rdx
373         addq    %r10,%r13
374         adcq    $0,%rdx
375         movq    %r13,-8(%rsp,%r9,8)
376         movq    %rdx,(%rsp,%r9,8)
377
378         leaq    1(%r14),%r14
379         cmpq    %r9,%r14
380         jb      .Louter
381
382         xorq    %r14,%r14
383         movq    (%rsp),%rax
384         leaq    (%rsp),%rsi
385         movq    %r9,%r15
386         jmp     .Lsub
387 .align  16
388 .Lsub:  sbbq    (%rcx,%r14,8),%rax
389         movq    %rax,(%rdi,%r14,8)
390         movq    8(%rsi,%r14,8),%rax
391         leaq    1(%r14),%r14
392         decq    %r15
393         jnz     .Lsub
394
395         sbbq    $0,%rax
396         movq    $-1,%rbx
397         xorq    %rax,%rbx
398         xorq    %r14,%r14
399         movq    %r9,%r15
400
401 .Lcopy:
402         movq    (%rdi,%r14,8),%rcx
403         movq    (%rsp,%r14,8),%rdx
404         andq    %rbx,%rcx
405         andq    %rax,%rdx
406         movq    %r14,(%rsp,%r14,8)
407         orq     %rcx,%rdx
408         movq    %rdx,(%rdi,%r14,8)
409         leaq    1(%r14),%r14
410         subq    $1,%r15
411         jnz     .Lcopy
412
413         movq    8(%rsp,%r9,8),%rsi
414         movq    $1,%rax
415
416         movq    -48(%rsi),%r15
417         movq    -40(%rsi),%r14
418         movq    -32(%rsi),%r13
419         movq    -24(%rsi),%r12
420         movq    -16(%rsi),%rbp
421         movq    -8(%rsi),%rbx
422         leaq    (%rsi),%rsp
423 .Lmul_epilogue:
424         .byte   0xf3,0xc3
425 .size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
426 .type   bn_mul4x_mont_gather5,@function
427 .align  32
428 bn_mul4x_mont_gather5:
429 .byte   0x67
430         movq    %rsp,%rax
431 .Lmul4x_enter:
432         andl    $0x80108,%r11d
433         cmpl    $0x80108,%r11d
434         je      .Lmulx4x_enter
435         pushq   %rbx
436         pushq   %rbp
437         pushq   %r12
438         pushq   %r13
439         pushq   %r14
440         pushq   %r15
441 .Lmul4x_prologue:
442
443 .byte   0x67
444         shll    $3,%r9d
445         leaq    (%r9,%r9,2),%r10
446         negq    %r9
447
448
449
450
451
452
453
454
455
456
457         leaq    -320(%rsp,%r9,2),%r11
458         movq    %rsp,%rbp
459         subq    %rdi,%r11
460         andq    $4095,%r11
461         cmpq    %r11,%r10
462         jb      .Lmul4xsp_alt
463         subq    %r11,%rbp
464         leaq    -320(%rbp,%r9,2),%rbp
465         jmp     .Lmul4xsp_done
466
467 .align  32
468 .Lmul4xsp_alt:
469         leaq    4096-320(,%r9,2),%r10
470         leaq    -320(%rbp,%r9,2),%rbp
471         subq    %r10,%r11
472         movq    $0,%r10
473         cmovcq  %r10,%r11
474         subq    %r11,%rbp
475 .Lmul4xsp_done:
476         andq    $-64,%rbp
477         movq    %rsp,%r11
478         subq    %rbp,%r11
479         andq    $-4096,%r11
480         leaq    (%r11,%rbp,1),%rsp
481         movq    (%rsp),%r10
482         cmpq    %rbp,%rsp
483         ja      .Lmul4x_page_walk
484         jmp     .Lmul4x_page_walk_done
485
486 .Lmul4x_page_walk:
487         leaq    -4096(%rsp),%rsp
488         movq    (%rsp),%r10
489         cmpq    %rbp,%rsp
490         ja      .Lmul4x_page_walk
491 .Lmul4x_page_walk_done:
492
493         negq    %r9
494
495         movq    %rax,40(%rsp)
496 .Lmul4x_body:
497
498         call    mul4x_internal
499
500         movq    40(%rsp),%rsi
501         movq    $1,%rax
502
503         movq    -48(%rsi),%r15
504         movq    -40(%rsi),%r14
505         movq    -32(%rsi),%r13
506         movq    -24(%rsi),%r12
507         movq    -16(%rsi),%rbp
508         movq    -8(%rsi),%rbx
509         leaq    (%rsi),%rsp
510 .Lmul4x_epilogue:
511         .byte   0xf3,0xc3
512 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
514 .type   mul4x_internal,@function
515 .align  32
516 mul4x_internal:
517         shlq    $5,%r9
518         movd    8(%rax),%xmm5
519         leaq    .Linc(%rip),%rax
520         leaq    128(%rdx,%r9,1),%r13
521         shrq    $5,%r9
522         movdqa  0(%rax),%xmm0
523         movdqa  16(%rax),%xmm1
524         leaq    88-112(%rsp,%r9,1),%r10
525         leaq    128(%rdx),%r12
526
527         pshufd  $0,%xmm5,%xmm5
528         movdqa  %xmm1,%xmm4
529 .byte   0x67,0x67
530         movdqa  %xmm1,%xmm2
531         paddd   %xmm0,%xmm1
532         pcmpeqd %xmm5,%xmm0
533 .byte   0x67
534         movdqa  %xmm4,%xmm3
535         paddd   %xmm1,%xmm2
536         pcmpeqd %xmm5,%xmm1
537         movdqa  %xmm0,112(%r10)
538         movdqa  %xmm4,%xmm0
539
540         paddd   %xmm2,%xmm3
541         pcmpeqd %xmm5,%xmm2
542         movdqa  %xmm1,128(%r10)
543         movdqa  %xmm4,%xmm1
544
545         paddd   %xmm3,%xmm0
546         pcmpeqd %xmm5,%xmm3
547         movdqa  %xmm2,144(%r10)
548         movdqa  %xmm4,%xmm2
549
550         paddd   %xmm0,%xmm1
551         pcmpeqd %xmm5,%xmm0
552         movdqa  %xmm3,160(%r10)
553         movdqa  %xmm4,%xmm3
554         paddd   %xmm1,%xmm2
555         pcmpeqd %xmm5,%xmm1
556         movdqa  %xmm0,176(%r10)
557         movdqa  %xmm4,%xmm0
558
559         paddd   %xmm2,%xmm3
560         pcmpeqd %xmm5,%xmm2
561         movdqa  %xmm1,192(%r10)
562         movdqa  %xmm4,%xmm1
563
564         paddd   %xmm3,%xmm0
565         pcmpeqd %xmm5,%xmm3
566         movdqa  %xmm2,208(%r10)
567         movdqa  %xmm4,%xmm2
568
569         paddd   %xmm0,%xmm1
570         pcmpeqd %xmm5,%xmm0
571         movdqa  %xmm3,224(%r10)
572         movdqa  %xmm4,%xmm3
573         paddd   %xmm1,%xmm2
574         pcmpeqd %xmm5,%xmm1
575         movdqa  %xmm0,240(%r10)
576         movdqa  %xmm4,%xmm0
577
578         paddd   %xmm2,%xmm3
579         pcmpeqd %xmm5,%xmm2
580         movdqa  %xmm1,256(%r10)
581         movdqa  %xmm4,%xmm1
582
583         paddd   %xmm3,%xmm0
584         pcmpeqd %xmm5,%xmm3
585         movdqa  %xmm2,272(%r10)
586         movdqa  %xmm4,%xmm2
587
588         paddd   %xmm0,%xmm1
589         pcmpeqd %xmm5,%xmm0
590         movdqa  %xmm3,288(%r10)
591         movdqa  %xmm4,%xmm3
592         paddd   %xmm1,%xmm2
593         pcmpeqd %xmm5,%xmm1
594         movdqa  %xmm0,304(%r10)
595
596         paddd   %xmm2,%xmm3
597 .byte   0x67
598         pcmpeqd %xmm5,%xmm2
599         movdqa  %xmm1,320(%r10)
600
601         pcmpeqd %xmm5,%xmm3
602         movdqa  %xmm2,336(%r10)
603         pand    64(%r12),%xmm0
604
605         pand    80(%r12),%xmm1
606         pand    96(%r12),%xmm2
607         movdqa  %xmm3,352(%r10)
608         pand    112(%r12),%xmm3
609         por     %xmm2,%xmm0
610         por     %xmm3,%xmm1
611         movdqa  -128(%r12),%xmm4
612         movdqa  -112(%r12),%xmm5
613         movdqa  -96(%r12),%xmm2
614         pand    112(%r10),%xmm4
615         movdqa  -80(%r12),%xmm3
616         pand    128(%r10),%xmm5
617         por     %xmm4,%xmm0
618         pand    144(%r10),%xmm2
619         por     %xmm5,%xmm1
620         pand    160(%r10),%xmm3
621         por     %xmm2,%xmm0
622         por     %xmm3,%xmm1
623         movdqa  -64(%r12),%xmm4
624         movdqa  -48(%r12),%xmm5
625         movdqa  -32(%r12),%xmm2
626         pand    176(%r10),%xmm4
627         movdqa  -16(%r12),%xmm3
628         pand    192(%r10),%xmm5
629         por     %xmm4,%xmm0
630         pand    208(%r10),%xmm2
631         por     %xmm5,%xmm1
632         pand    224(%r10),%xmm3
633         por     %xmm2,%xmm0
634         por     %xmm3,%xmm1
635         movdqa  0(%r12),%xmm4
636         movdqa  16(%r12),%xmm5
637         movdqa  32(%r12),%xmm2
638         pand    240(%r10),%xmm4
639         movdqa  48(%r12),%xmm3
640         pand    256(%r10),%xmm5
641         por     %xmm4,%xmm0
642         pand    272(%r10),%xmm2
643         por     %xmm5,%xmm1
644         pand    288(%r10),%xmm3
645         por     %xmm2,%xmm0
646         por     %xmm3,%xmm1
647         por     %xmm1,%xmm0
648         pshufd  $0x4e,%xmm0,%xmm1
649         por     %xmm1,%xmm0
650         leaq    256(%r12),%r12
651 .byte   102,72,15,126,195
652
653         movq    %r13,16+8(%rsp)
654         movq    %rdi,56+8(%rsp)
655
656         movq    (%r8),%r8
657         movq    (%rsi),%rax
658         leaq    (%rsi,%r9,1),%rsi
659         negq    %r9
660
661         movq    %r8,%rbp
662         mulq    %rbx
663         movq    %rax,%r10
664         movq    (%rcx),%rax
665
666         imulq   %r10,%rbp
667         leaq    64+8(%rsp),%r14
668         movq    %rdx,%r11
669
670         mulq    %rbp
671         addq    %rax,%r10
672         movq    8(%rsi,%r9,1),%rax
673         adcq    $0,%rdx
674         movq    %rdx,%rdi
675
676         mulq    %rbx
677         addq    %rax,%r11
678         movq    8(%rcx),%rax
679         adcq    $0,%rdx
680         movq    %rdx,%r10
681
682         mulq    %rbp
683         addq    %rax,%rdi
684         movq    16(%rsi,%r9,1),%rax
685         adcq    $0,%rdx
686         addq    %r11,%rdi
687         leaq    32(%r9),%r15
688         leaq    32(%rcx),%rcx
689         adcq    $0,%rdx
690         movq    %rdi,(%r14)
691         movq    %rdx,%r13
692         jmp     .L1st4x
693
694 .align  32
695 .L1st4x:
696         mulq    %rbx
697         addq    %rax,%r10
698         movq    -16(%rcx),%rax
699         leaq    32(%r14),%r14
700         adcq    $0,%rdx
701         movq    %rdx,%r11
702
703         mulq    %rbp
704         addq    %rax,%r13
705         movq    -8(%rsi,%r15,1),%rax
706         adcq    $0,%rdx
707         addq    %r10,%r13
708         adcq    $0,%rdx
709         movq    %r13,-24(%r14)
710         movq    %rdx,%rdi
711
712         mulq    %rbx
713         addq    %rax,%r11
714         movq    -8(%rcx),%rax
715         adcq    $0,%rdx
716         movq    %rdx,%r10
717
718         mulq    %rbp
719         addq    %rax,%rdi
720         movq    (%rsi,%r15,1),%rax
721         adcq    $0,%rdx
722         addq    %r11,%rdi
723         adcq    $0,%rdx
724         movq    %rdi,-16(%r14)
725         movq    %rdx,%r13
726
727         mulq    %rbx
728         addq    %rax,%r10
729         movq    0(%rcx),%rax
730         adcq    $0,%rdx
731         movq    %rdx,%r11
732
733         mulq    %rbp
734         addq    %rax,%r13
735         movq    8(%rsi,%r15,1),%rax
736         adcq    $0,%rdx
737         addq    %r10,%r13
738         adcq    $0,%rdx
739         movq    %r13,-8(%r14)
740         movq    %rdx,%rdi
741
742         mulq    %rbx
743         addq    %rax,%r11
744         movq    8(%rcx),%rax
745         adcq    $0,%rdx
746         movq    %rdx,%r10
747
748         mulq    %rbp
749         addq    %rax,%rdi
750         movq    16(%rsi,%r15,1),%rax
751         adcq    $0,%rdx
752         addq    %r11,%rdi
753         leaq    32(%rcx),%rcx
754         adcq    $0,%rdx
755         movq    %rdi,(%r14)
756         movq    %rdx,%r13
757
758         addq    $32,%r15
759         jnz     .L1st4x
760
761         mulq    %rbx
762         addq    %rax,%r10
763         movq    -16(%rcx),%rax
764         leaq    32(%r14),%r14
765         adcq    $0,%rdx
766         movq    %rdx,%r11
767
768         mulq    %rbp
769         addq    %rax,%r13
770         movq    -8(%rsi),%rax
771         adcq    $0,%rdx
772         addq    %r10,%r13
773         adcq    $0,%rdx
774         movq    %r13,-24(%r14)
775         movq    %rdx,%rdi
776
777         mulq    %rbx
778         addq    %rax,%r11
779         movq    -8(%rcx),%rax
780         adcq    $0,%rdx
781         movq    %rdx,%r10
782
783         mulq    %rbp
784         addq    %rax,%rdi
785         movq    (%rsi,%r9,1),%rax
786         adcq    $0,%rdx
787         addq    %r11,%rdi
788         adcq    $0,%rdx
789         movq    %rdi,-16(%r14)
790         movq    %rdx,%r13
791
792         leaq    (%rcx,%r9,1),%rcx
793
794         xorq    %rdi,%rdi
795         addq    %r10,%r13
796         adcq    $0,%rdi
797         movq    %r13,-8(%r14)
798
799         jmp     .Louter4x
800
801 .align  32
802 .Louter4x:
803         leaq    16+128(%r14),%rdx
804         pxor    %xmm4,%xmm4
805         pxor    %xmm5,%xmm5
806         movdqa  -128(%r12),%xmm0
807         movdqa  -112(%r12),%xmm1
808         movdqa  -96(%r12),%xmm2
809         movdqa  -80(%r12),%xmm3
810         pand    -128(%rdx),%xmm0
811         pand    -112(%rdx),%xmm1
812         por     %xmm0,%xmm4
813         pand    -96(%rdx),%xmm2
814         por     %xmm1,%xmm5
815         pand    -80(%rdx),%xmm3
816         por     %xmm2,%xmm4
817         por     %xmm3,%xmm5
818         movdqa  -64(%r12),%xmm0
819         movdqa  -48(%r12),%xmm1
820         movdqa  -32(%r12),%xmm2
821         movdqa  -16(%r12),%xmm3
822         pand    -64(%rdx),%xmm0
823         pand    -48(%rdx),%xmm1
824         por     %xmm0,%xmm4
825         pand    -32(%rdx),%xmm2
826         por     %xmm1,%xmm5
827         pand    -16(%rdx),%xmm3
828         por     %xmm2,%xmm4
829         por     %xmm3,%xmm5
830         movdqa  0(%r12),%xmm0
831         movdqa  16(%r12),%xmm1
832         movdqa  32(%r12),%xmm2
833         movdqa  48(%r12),%xmm3
834         pand    0(%rdx),%xmm0
835         pand    16(%rdx),%xmm1
836         por     %xmm0,%xmm4
837         pand    32(%rdx),%xmm2
838         por     %xmm1,%xmm5
839         pand    48(%rdx),%xmm3
840         por     %xmm2,%xmm4
841         por     %xmm3,%xmm5
842         movdqa  64(%r12),%xmm0
843         movdqa  80(%r12),%xmm1
844         movdqa  96(%r12),%xmm2
845         movdqa  112(%r12),%xmm3
846         pand    64(%rdx),%xmm0
847         pand    80(%rdx),%xmm1
848         por     %xmm0,%xmm4
849         pand    96(%rdx),%xmm2
850         por     %xmm1,%xmm5
851         pand    112(%rdx),%xmm3
852         por     %xmm2,%xmm4
853         por     %xmm3,%xmm5
854         por     %xmm5,%xmm4
855         pshufd  $0x4e,%xmm4,%xmm0
856         por     %xmm4,%xmm0
857         leaq    256(%r12),%r12
858 .byte   102,72,15,126,195
859
860         movq    (%r14,%r9,1),%r10
861         movq    %r8,%rbp
862         mulq    %rbx
863         addq    %rax,%r10
864         movq    (%rcx),%rax
865         adcq    $0,%rdx
866
867         imulq   %r10,%rbp
868         movq    %rdx,%r11
869         movq    %rdi,(%r14)
870
871         leaq    (%r14,%r9,1),%r14
872
873         mulq    %rbp
874         addq    %rax,%r10
875         movq    8(%rsi,%r9,1),%rax
876         adcq    $0,%rdx
877         movq    %rdx,%rdi
878
879         mulq    %rbx
880         addq    %rax,%r11
881         movq    8(%rcx),%rax
882         adcq    $0,%rdx
883         addq    8(%r14),%r11
884         adcq    $0,%rdx
885         movq    %rdx,%r10
886
887         mulq    %rbp
888         addq    %rax,%rdi
889         movq    16(%rsi,%r9,1),%rax
890         adcq    $0,%rdx
891         addq    %r11,%rdi
892         leaq    32(%r9),%r15
893         leaq    32(%rcx),%rcx
894         adcq    $0,%rdx
895         movq    %rdx,%r13
896         jmp     .Linner4x
897
898 .align  32
899 .Linner4x:
900         mulq    %rbx
901         addq    %rax,%r10
902         movq    -16(%rcx),%rax
903         adcq    $0,%rdx
904         addq    16(%r14),%r10
905         leaq    32(%r14),%r14
906         adcq    $0,%rdx
907         movq    %rdx,%r11
908
909         mulq    %rbp
910         addq    %rax,%r13
911         movq    -8(%rsi,%r15,1),%rax
912         adcq    $0,%rdx
913         addq    %r10,%r13
914         adcq    $0,%rdx
915         movq    %rdi,-32(%r14)
916         movq    %rdx,%rdi
917
918         mulq    %rbx
919         addq    %rax,%r11
920         movq    -8(%rcx),%rax
921         adcq    $0,%rdx
922         addq    -8(%r14),%r11
923         adcq    $0,%rdx
924         movq    %rdx,%r10
925
926         mulq    %rbp
927         addq    %rax,%rdi
928         movq    (%rsi,%r15,1),%rax
929         adcq    $0,%rdx
930         addq    %r11,%rdi
931         adcq    $0,%rdx
932         movq    %r13,-24(%r14)
933         movq    %rdx,%r13
934
935         mulq    %rbx
936         addq    %rax,%r10
937         movq    0(%rcx),%rax
938         adcq    $0,%rdx
939         addq    (%r14),%r10
940         adcq    $0,%rdx
941         movq    %rdx,%r11
942
943         mulq    %rbp
944         addq    %rax,%r13
945         movq    8(%rsi,%r15,1),%rax
946         adcq    $0,%rdx
947         addq    %r10,%r13
948         adcq    $0,%rdx
949         movq    %rdi,-16(%r14)
950         movq    %rdx,%rdi
951
952         mulq    %rbx
953         addq    %rax,%r11
954         movq    8(%rcx),%rax
955         adcq    $0,%rdx
956         addq    8(%r14),%r11
957         adcq    $0,%rdx
958         movq    %rdx,%r10
959
960         mulq    %rbp
961         addq    %rax,%rdi
962         movq    16(%rsi,%r15,1),%rax
963         adcq    $0,%rdx
964         addq    %r11,%rdi
965         leaq    32(%rcx),%rcx
966         adcq    $0,%rdx
967         movq    %r13,-8(%r14)
968         movq    %rdx,%r13
969
970         addq    $32,%r15
971         jnz     .Linner4x
972
973         mulq    %rbx
974         addq    %rax,%r10
975         movq    -16(%rcx),%rax
976         adcq    $0,%rdx
977         addq    16(%r14),%r10
978         leaq    32(%r14),%r14
979         adcq    $0,%rdx
980         movq    %rdx,%r11
981
982         mulq    %rbp
983         addq    %rax,%r13
984         movq    -8(%rsi),%rax
985         adcq    $0,%rdx
986         addq    %r10,%r13
987         adcq    $0,%rdx
988         movq    %rdi,-32(%r14)
989         movq    %rdx,%rdi
990
991         mulq    %rbx
992         addq    %rax,%r11
993         movq    %rbp,%rax
994         movq    -8(%rcx),%rbp
995         adcq    $0,%rdx
996         addq    -8(%r14),%r11
997         adcq    $0,%rdx
998         movq    %rdx,%r10
999
1000         mulq    %rbp
1001         addq    %rax,%rdi
1002         movq    (%rsi,%r9,1),%rax
1003         adcq    $0,%rdx
1004         addq    %r11,%rdi
1005         adcq    $0,%rdx
1006         movq    %r13,-24(%r14)
1007         movq    %rdx,%r13
1008
1009         movq    %rdi,-16(%r14)
1010         leaq    (%rcx,%r9,1),%rcx
1011
1012         xorq    %rdi,%rdi
1013         addq    %r10,%r13
1014         adcq    $0,%rdi
1015         addq    (%r14),%r13
1016         adcq    $0,%rdi
1017         movq    %r13,-8(%r14)
1018
1019         cmpq    16+8(%rsp),%r12
1020         jb      .Louter4x
1021         xorq    %rax,%rax
1022         subq    %r13,%rbp
1023         adcq    %r15,%r15
1024         orq     %r15,%rdi
1025         subq    %rdi,%rax
1026         leaq    (%r14,%r9,1),%rbx
1027         movq    (%rcx),%r12
1028         leaq    (%rcx),%rbp
1029         movq    %r9,%rcx
1030         sarq    $3+2,%rcx
1031         movq    56+8(%rsp),%rdi
1032         decq    %r12
1033         xorq    %r10,%r10
1034         movq    8(%rbp),%r13
1035         movq    16(%rbp),%r14
1036         movq    24(%rbp),%r15
1037         jmp     .Lsqr4x_sub_entry
1038 .size   mul4x_internal,.-mul4x_internal
1039 .globl  bn_power5
1040 .type   bn_power5,@function
1041 .align  32
1042 bn_power5:
1043         movq    %rsp,%rax
1044         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
1045         andl    $0x80108,%r11d
1046         cmpl    $0x80108,%r11d
1047         je      .Lpowerx5_enter
1048         pushq   %rbx
1049         pushq   %rbp
1050         pushq   %r12
1051         pushq   %r13
1052         pushq   %r14
1053         pushq   %r15
1054 .Lpower5_prologue:
1055
1056         shll    $3,%r9d
1057         leal    (%r9,%r9,2),%r10d
1058         negq    %r9
1059         movq    (%r8),%r8
1060
1061
1062
1063
1064
1065
1066
1067
1068         leaq    -320(%rsp,%r9,2),%r11
1069         movq    %rsp,%rbp
1070         subq    %rdi,%r11
1071         andq    $4095,%r11
1072         cmpq    %r11,%r10
1073         jb      .Lpwr_sp_alt
1074         subq    %r11,%rbp
1075         leaq    -320(%rbp,%r9,2),%rbp
1076         jmp     .Lpwr_sp_done
1077
1078 .align  32
1079 .Lpwr_sp_alt:
1080         leaq    4096-320(,%r9,2),%r10
1081         leaq    -320(%rbp,%r9,2),%rbp
1082         subq    %r10,%r11
1083         movq    $0,%r10
1084         cmovcq  %r10,%r11
1085         subq    %r11,%rbp
1086 .Lpwr_sp_done:
1087         andq    $-64,%rbp
1088         movq    %rsp,%r11
1089         subq    %rbp,%r11
1090         andq    $-4096,%r11
1091         leaq    (%r11,%rbp,1),%rsp
1092         movq    (%rsp),%r10
1093         cmpq    %rbp,%rsp
1094         ja      .Lpwr_page_walk
1095         jmp     .Lpwr_page_walk_done
1096
1097 .Lpwr_page_walk:
1098         leaq    -4096(%rsp),%rsp
1099         movq    (%rsp),%r10
1100         cmpq    %rbp,%rsp
1101         ja      .Lpwr_page_walk
1102 .Lpwr_page_walk_done:
1103
1104         movq    %r9,%r10
1105         negq    %r9
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116         movq    %r8,32(%rsp)
1117         movq    %rax,40(%rsp)
1118 .Lpower5_body:
1119 .byte   102,72,15,110,207
1120 .byte   102,72,15,110,209
1121 .byte   102,73,15,110,218
1122 .byte   102,72,15,110,226
1123
1124         call    __bn_sqr8x_internal
1125         call    __bn_post4x_internal
1126         call    __bn_sqr8x_internal
1127         call    __bn_post4x_internal
1128         call    __bn_sqr8x_internal
1129         call    __bn_post4x_internal
1130         call    __bn_sqr8x_internal
1131         call    __bn_post4x_internal
1132         call    __bn_sqr8x_internal
1133         call    __bn_post4x_internal
1134
1135 .byte   102,72,15,126,209
1136 .byte   102,72,15,126,226
1137         movq    %rsi,%rdi
1138         movq    40(%rsp),%rax
1139         leaq    32(%rsp),%r8
1140
1141         call    mul4x_internal
1142
1143         movq    40(%rsp),%rsi
1144         movq    $1,%rax
1145         movq    -48(%rsi),%r15
1146         movq    -40(%rsi),%r14
1147         movq    -32(%rsi),%r13
1148         movq    -24(%rsi),%r12
1149         movq    -16(%rsi),%rbp
1150         movq    -8(%rsi),%rbx
1151         leaq    (%rsi),%rsp
1152 .Lpower5_epilogue:
1153         .byte   0xf3,0xc3
1154 .size   bn_power5,.-bn_power5
1155
1156 .globl  bn_sqr8x_internal
1157 .hidden bn_sqr8x_internal
1158 .type   bn_sqr8x_internal,@function
1159 .align  32
1160 bn_sqr8x_internal:
1161 __bn_sqr8x_internal:
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235         leaq    32(%r10),%rbp
1236         leaq    (%rsi,%r9,1),%rsi
1237
1238         movq    %r9,%rcx
1239
1240
1241         movq    -32(%rsi,%rbp,1),%r14
1242         leaq    48+8(%rsp,%r9,2),%rdi
1243         movq    -24(%rsi,%rbp,1),%rax
1244         leaq    -32(%rdi,%rbp,1),%rdi
1245         movq    -16(%rsi,%rbp,1),%rbx
1246         movq    %rax,%r15
1247
1248         mulq    %r14
1249         movq    %rax,%r10
1250         movq    %rbx,%rax
1251         movq    %rdx,%r11
1252         movq    %r10,-24(%rdi,%rbp,1)
1253
1254         mulq    %r14
1255         addq    %rax,%r11
1256         movq    %rbx,%rax
1257         adcq    $0,%rdx
1258         movq    %r11,-16(%rdi,%rbp,1)
1259         movq    %rdx,%r10
1260
1261
1262         movq    -8(%rsi,%rbp,1),%rbx
1263         mulq    %r15
1264         movq    %rax,%r12
1265         movq    %rbx,%rax
1266         movq    %rdx,%r13
1267
1268         leaq    (%rbp),%rcx
1269         mulq    %r14
1270         addq    %rax,%r10
1271         movq    %rbx,%rax
1272         movq    %rdx,%r11
1273         adcq    $0,%r11
1274         addq    %r12,%r10
1275         adcq    $0,%r11
1276         movq    %r10,-8(%rdi,%rcx,1)
1277         jmp     .Lsqr4x_1st
1278
1279 .align  32
1280 .Lsqr4x_1st:
1281         movq    (%rsi,%rcx,1),%rbx
1282         mulq    %r15
1283         addq    %rax,%r13
1284         movq    %rbx,%rax
1285         movq    %rdx,%r12
1286         adcq    $0,%r12
1287
1288         mulq    %r14
1289         addq    %rax,%r11
1290         movq    %rbx,%rax
1291         movq    8(%rsi,%rcx,1),%rbx
1292         movq    %rdx,%r10
1293         adcq    $0,%r10
1294         addq    %r13,%r11
1295         adcq    $0,%r10
1296
1297
1298         mulq    %r15
1299         addq    %rax,%r12
1300         movq    %rbx,%rax
1301         movq    %r11,(%rdi,%rcx,1)
1302         movq    %rdx,%r13
1303         adcq    $0,%r13
1304
1305         mulq    %r14
1306         addq    %rax,%r10
1307         movq    %rbx,%rax
1308         movq    16(%rsi,%rcx,1),%rbx
1309         movq    %rdx,%r11
1310         adcq    $0,%r11
1311         addq    %r12,%r10
1312         adcq    $0,%r11
1313
1314         mulq    %r15
1315         addq    %rax,%r13
1316         movq    %rbx,%rax
1317         movq    %r10,8(%rdi,%rcx,1)
1318         movq    %rdx,%r12
1319         adcq    $0,%r12
1320
1321         mulq    %r14
1322         addq    %rax,%r11
1323         movq    %rbx,%rax
1324         movq    24(%rsi,%rcx,1),%rbx
1325         movq    %rdx,%r10
1326         adcq    $0,%r10
1327         addq    %r13,%r11
1328         adcq    $0,%r10
1329
1330
1331         mulq    %r15
1332         addq    %rax,%r12
1333         movq    %rbx,%rax
1334         movq    %r11,16(%rdi,%rcx,1)
1335         movq    %rdx,%r13
1336         adcq    $0,%r13
1337         leaq    32(%rcx),%rcx
1338
1339         mulq    %r14
1340         addq    %rax,%r10
1341         movq    %rbx,%rax
1342         movq    %rdx,%r11
1343         adcq    $0,%r11
1344         addq    %r12,%r10
1345         adcq    $0,%r11
1346         movq    %r10,-8(%rdi,%rcx,1)
1347
1348         cmpq    $0,%rcx
1349         jne     .Lsqr4x_1st
1350
1351         mulq    %r15
1352         addq    %rax,%r13
1353         leaq    16(%rbp),%rbp
1354         adcq    $0,%rdx
1355         addq    %r11,%r13
1356         adcq    $0,%rdx
1357
1358         movq    %r13,(%rdi)
1359         movq    %rdx,%r12
1360         movq    %rdx,8(%rdi)
1361         jmp     .Lsqr4x_outer
1362
1363 .align  32
1364 .Lsqr4x_outer:
1365         movq    -32(%rsi,%rbp,1),%r14
1366         leaq    48+8(%rsp,%r9,2),%rdi
1367         movq    -24(%rsi,%rbp,1),%rax
1368         leaq    -32(%rdi,%rbp,1),%rdi
1369         movq    -16(%rsi,%rbp,1),%rbx
1370         movq    %rax,%r15
1371
1372         mulq    %r14
1373         movq    -24(%rdi,%rbp,1),%r10
1374         addq    %rax,%r10
1375         movq    %rbx,%rax
1376         adcq    $0,%rdx
1377         movq    %r10,-24(%rdi,%rbp,1)
1378         movq    %rdx,%r11
1379
1380         mulq    %r14
1381         addq    %rax,%r11
1382         movq    %rbx,%rax
1383         adcq    $0,%rdx
1384         addq    -16(%rdi,%rbp,1),%r11
1385         movq    %rdx,%r10
1386         adcq    $0,%r10
1387         movq    %r11,-16(%rdi,%rbp,1)
1388
1389         xorq    %r12,%r12
1390
1391         movq    -8(%rsi,%rbp,1),%rbx
1392         mulq    %r15
1393         addq    %rax,%r12
1394         movq    %rbx,%rax
1395         adcq    $0,%rdx
1396         addq    -8(%rdi,%rbp,1),%r12
1397         movq    %rdx,%r13
1398         adcq    $0,%r13
1399
1400         mulq    %r14
1401         addq    %rax,%r10
1402         movq    %rbx,%rax
1403         adcq    $0,%rdx
1404         addq    %r12,%r10
1405         movq    %rdx,%r11
1406         adcq    $0,%r11
1407         movq    %r10,-8(%rdi,%rbp,1)
1408
1409         leaq    (%rbp),%rcx
1410         jmp     .Lsqr4x_inner
1411
1412 .align  32
1413 .Lsqr4x_inner:
1414         movq    (%rsi,%rcx,1),%rbx
1415         mulq    %r15
1416         addq    %rax,%r13
1417         movq    %rbx,%rax
1418         movq    %rdx,%r12
1419         adcq    $0,%r12
1420         addq    (%rdi,%rcx,1),%r13
1421         adcq    $0,%r12
1422
1423 .byte   0x67
1424         mulq    %r14
1425         addq    %rax,%r11
1426         movq    %rbx,%rax
1427         movq    8(%rsi,%rcx,1),%rbx
1428         movq    %rdx,%r10
1429         adcq    $0,%r10
1430         addq    %r13,%r11
1431         adcq    $0,%r10
1432
1433         mulq    %r15
1434         addq    %rax,%r12
1435         movq    %r11,(%rdi,%rcx,1)
1436         movq    %rbx,%rax
1437         movq    %rdx,%r13
1438         adcq    $0,%r13
1439         addq    8(%rdi,%rcx,1),%r12
1440         leaq    16(%rcx),%rcx
1441         adcq    $0,%r13
1442
1443         mulq    %r14
1444         addq    %rax,%r10
1445         movq    %rbx,%rax
1446         adcq    $0,%rdx
1447         addq    %r12,%r10
1448         movq    %rdx,%r11
1449         adcq    $0,%r11
1450         movq    %r10,-8(%rdi,%rcx,1)
1451
1452         cmpq    $0,%rcx
1453         jne     .Lsqr4x_inner
1454
1455 .byte   0x67
1456         mulq    %r15
1457         addq    %rax,%r13
1458         adcq    $0,%rdx
1459         addq    %r11,%r13
1460         adcq    $0,%rdx
1461
1462         movq    %r13,(%rdi)
1463         movq    %rdx,%r12
1464         movq    %rdx,8(%rdi)
1465
1466         addq    $16,%rbp
1467         jnz     .Lsqr4x_outer
1468
1469
1470         movq    -32(%rsi),%r14
1471         leaq    48+8(%rsp,%r9,2),%rdi
1472         movq    -24(%rsi),%rax
1473         leaq    -32(%rdi,%rbp,1),%rdi
1474         movq    -16(%rsi),%rbx
1475         movq    %rax,%r15
1476
1477         mulq    %r14
1478         addq    %rax,%r10
1479         movq    %rbx,%rax
1480         movq    %rdx,%r11
1481         adcq    $0,%r11
1482
1483         mulq    %r14
1484         addq    %rax,%r11
1485         movq    %rbx,%rax
1486         movq    %r10,-24(%rdi)
1487         movq    %rdx,%r10
1488         adcq    $0,%r10
1489         addq    %r13,%r11
1490         movq    -8(%rsi),%rbx
1491         adcq    $0,%r10
1492
1493         mulq    %r15
1494         addq    %rax,%r12
1495         movq    %rbx,%rax
1496         movq    %r11,-16(%rdi)
1497         movq    %rdx,%r13
1498         adcq    $0,%r13
1499
1500         mulq    %r14
1501         addq    %rax,%r10
1502         movq    %rbx,%rax
1503         movq    %rdx,%r11
1504         adcq    $0,%r11
1505         addq    %r12,%r10
1506         adcq    $0,%r11
1507         movq    %r10,-8(%rdi)
1508
1509         mulq    %r15
1510         addq    %rax,%r13
1511         movq    -16(%rsi),%rax
1512         adcq    $0,%rdx
1513         addq    %r11,%r13
1514         adcq    $0,%rdx
1515
1516         movq    %r13,(%rdi)
1517         movq    %rdx,%r12
1518         movq    %rdx,8(%rdi)
1519
1520         mulq    %rbx
1521         addq    $16,%rbp
1522         xorq    %r14,%r14
1523         subq    %r9,%rbp
1524         xorq    %r15,%r15
1525
1526         addq    %r12,%rax
1527         adcq    $0,%rdx
1528         movq    %rax,8(%rdi)
1529         movq    %rdx,16(%rdi)
1530         movq    %r15,24(%rdi)
1531
1532         movq    -16(%rsi,%rbp,1),%rax
1533         leaq    48+8(%rsp),%rdi
1534         xorq    %r10,%r10
1535         movq    8(%rdi),%r11
1536
1537         leaq    (%r14,%r10,2),%r12
1538         shrq    $63,%r10
1539         leaq    (%rcx,%r11,2),%r13
1540         shrq    $63,%r11
1541         orq     %r10,%r13
1542         movq    16(%rdi),%r10
1543         movq    %r11,%r14
1544         mulq    %rax
1545         negq    %r15
1546         movq    24(%rdi),%r11
1547         adcq    %rax,%r12
1548         movq    -8(%rsi,%rbp,1),%rax
1549         movq    %r12,(%rdi)
1550         adcq    %rdx,%r13
1551
1552         leaq    (%r14,%r10,2),%rbx
1553         movq    %r13,8(%rdi)
1554         sbbq    %r15,%r15
1555         shrq    $63,%r10
1556         leaq    (%rcx,%r11,2),%r8
1557         shrq    $63,%r11
1558         orq     %r10,%r8
1559         movq    32(%rdi),%r10
1560         movq    %r11,%r14
1561         mulq    %rax
1562         negq    %r15
1563         movq    40(%rdi),%r11
1564         adcq    %rax,%rbx
1565         movq    0(%rsi,%rbp,1),%rax
1566         movq    %rbx,16(%rdi)
1567         adcq    %rdx,%r8
1568         leaq    16(%rbp),%rbp
1569         movq    %r8,24(%rdi)
1570         sbbq    %r15,%r15
1571         leaq    64(%rdi),%rdi
1572         jmp     .Lsqr4x_shift_n_add
1573
1574 .align  32
1575 .Lsqr4x_shift_n_add:
1576         leaq    (%r14,%r10,2),%r12
1577         shrq    $63,%r10
1578         leaq    (%rcx,%r11,2),%r13
1579         shrq    $63,%r11
1580         orq     %r10,%r13
1581         movq    -16(%rdi),%r10
1582         movq    %r11,%r14
1583         mulq    %rax
1584         negq    %r15
1585         movq    -8(%rdi),%r11
1586         adcq    %rax,%r12
1587         movq    -8(%rsi,%rbp,1),%rax
1588         movq    %r12,-32(%rdi)
1589         adcq    %rdx,%r13
1590
1591         leaq    (%r14,%r10,2),%rbx
1592         movq    %r13,-24(%rdi)
1593         sbbq    %r15,%r15
1594         shrq    $63,%r10
1595         leaq    (%rcx,%r11,2),%r8
1596         shrq    $63,%r11
1597         orq     %r10,%r8
1598         movq    0(%rdi),%r10
1599         movq    %r11,%r14
1600         mulq    %rax
1601         negq    %r15
1602         movq    8(%rdi),%r11
1603         adcq    %rax,%rbx
1604         movq    0(%rsi,%rbp,1),%rax
1605         movq    %rbx,-16(%rdi)
1606         adcq    %rdx,%r8
1607
1608         leaq    (%r14,%r10,2),%r12
1609         movq    %r8,-8(%rdi)
1610         sbbq    %r15,%r15
1611         shrq    $63,%r10
1612         leaq    (%rcx,%r11,2),%r13
1613         shrq    $63,%r11
1614         orq     %r10,%r13
1615         movq    16(%rdi),%r10
1616         movq    %r11,%r14
1617         mulq    %rax
1618         negq    %r15
1619         movq    24(%rdi),%r11
1620         adcq    %rax,%r12
1621         movq    8(%rsi,%rbp,1),%rax
1622         movq    %r12,0(%rdi)
1623         adcq    %rdx,%r13
1624
1625         leaq    (%r14,%r10,2),%rbx
1626         movq    %r13,8(%rdi)
1627         sbbq    %r15,%r15
1628         shrq    $63,%r10
1629         leaq    (%rcx,%r11,2),%r8
1630         shrq    $63,%r11
1631         orq     %r10,%r8
1632         movq    32(%rdi),%r10
1633         movq    %r11,%r14
1634         mulq    %rax
1635         negq    %r15
1636         movq    40(%rdi),%r11
1637         adcq    %rax,%rbx
1638         movq    16(%rsi,%rbp,1),%rax
1639         movq    %rbx,16(%rdi)
1640         adcq    %rdx,%r8
1641         movq    %r8,24(%rdi)
1642         sbbq    %r15,%r15
1643         leaq    64(%rdi),%rdi
1644         addq    $32,%rbp
1645         jnz     .Lsqr4x_shift_n_add
1646
1647         leaq    (%r14,%r10,2),%r12
1648 .byte   0x67
1649         shrq    $63,%r10
1650         leaq    (%rcx,%r11,2),%r13
1651         shrq    $63,%r11
1652         orq     %r10,%r13
1653         movq    -16(%rdi),%r10
1654         movq    %r11,%r14
1655         mulq    %rax
1656         negq    %r15
1657         movq    -8(%rdi),%r11
1658         adcq    %rax,%r12
1659         movq    -8(%rsi),%rax
1660         movq    %r12,-32(%rdi)
1661         adcq    %rdx,%r13
1662
1663         leaq    (%r14,%r10,2),%rbx
1664         movq    %r13,-24(%rdi)
1665         sbbq    %r15,%r15
1666         shrq    $63,%r10
1667         leaq    (%rcx,%r11,2),%r8
1668         shrq    $63,%r11
1669         orq     %r10,%r8
1670         mulq    %rax
1671         negq    %r15
1672         adcq    %rax,%rbx
1673         adcq    %rdx,%r8
1674         movq    %rbx,-16(%rdi)
1675         movq    %r8,-8(%rdi)
1676 .byte   102,72,15,126,213
1677 __bn_sqr8x_reduction:
1678         xorq    %rax,%rax
1679         leaq    (%r9,%rbp,1),%rcx
1680         leaq    48+8(%rsp,%r9,2),%rdx
1681         movq    %rcx,0+8(%rsp)
1682         leaq    48+8(%rsp,%r9,1),%rdi
1683         movq    %rdx,8+8(%rsp)
1684         negq    %r9
1685         jmp     .L8x_reduction_loop
1686
1687 .align  32
1688 .L8x_reduction_loop:
1689         leaq    (%rdi,%r9,1),%rdi
1690 .byte   0x66
1691         movq    0(%rdi),%rbx
1692         movq    8(%rdi),%r9
1693         movq    16(%rdi),%r10
1694         movq    24(%rdi),%r11
1695         movq    32(%rdi),%r12
1696         movq    40(%rdi),%r13
1697         movq    48(%rdi),%r14
1698         movq    56(%rdi),%r15
1699         movq    %rax,(%rdx)
1700         leaq    64(%rdi),%rdi
1701
1702 .byte   0x67
1703         movq    %rbx,%r8
1704         imulq   32+8(%rsp),%rbx
1705         movq    0(%rbp),%rax
1706         movl    $8,%ecx
1707         jmp     .L8x_reduce
1708
1709 .align  32
1710 .L8x_reduce:
1711         mulq    %rbx
1712         movq    8(%rbp),%rax
1713         negq    %r8
1714         movq    %rdx,%r8
1715         adcq    $0,%r8
1716
1717         mulq    %rbx
1718         addq    %rax,%r9
1719         movq    16(%rbp),%rax
1720         adcq    $0,%rdx
1721         addq    %r9,%r8
1722         movq    %rbx,48-8+8(%rsp,%rcx,8)
1723         movq    %rdx,%r9
1724         adcq    $0,%r9
1725
1726         mulq    %rbx
1727         addq    %rax,%r10
1728         movq    24(%rbp),%rax
1729         adcq    $0,%rdx
1730         addq    %r10,%r9
1731         movq    32+8(%rsp),%rsi
1732         movq    %rdx,%r10
1733         adcq    $0,%r10
1734
1735         mulq    %rbx
1736         addq    %rax,%r11
1737         movq    32(%rbp),%rax
1738         adcq    $0,%rdx
1739         imulq   %r8,%rsi
1740         addq    %r11,%r10
1741         movq    %rdx,%r11
1742         adcq    $0,%r11
1743
1744         mulq    %rbx
1745         addq    %rax,%r12
1746         movq    40(%rbp),%rax
1747         adcq    $0,%rdx
1748         addq    %r12,%r11
1749         movq    %rdx,%r12
1750         adcq    $0,%r12
1751
1752         mulq    %rbx
1753         addq    %rax,%r13
1754         movq    48(%rbp),%rax
1755         adcq    $0,%rdx
1756         addq    %r13,%r12
1757         movq    %rdx,%r13
1758         adcq    $0,%r13
1759
1760         mulq    %rbx
1761         addq    %rax,%r14
1762         movq    56(%rbp),%rax
1763         adcq    $0,%rdx
1764         addq    %r14,%r13
1765         movq    %rdx,%r14
1766         adcq    $0,%r14
1767
1768         mulq    %rbx
1769         movq    %rsi,%rbx
1770         addq    %rax,%r15
1771         movq    0(%rbp),%rax
1772         adcq    $0,%rdx
1773         addq    %r15,%r14
1774         movq    %rdx,%r15
1775         adcq    $0,%r15
1776
1777         decl    %ecx
1778         jnz     .L8x_reduce
1779
1780         leaq    64(%rbp),%rbp
1781         xorq    %rax,%rax
1782         movq    8+8(%rsp),%rdx
1783         cmpq    0+8(%rsp),%rbp
1784         jae     .L8x_no_tail
1785
1786 .byte   0x66
1787         addq    0(%rdi),%r8
1788         adcq    8(%rdi),%r9
1789         adcq    16(%rdi),%r10
1790         adcq    24(%rdi),%r11
1791         adcq    32(%rdi),%r12
1792         adcq    40(%rdi),%r13
1793         adcq    48(%rdi),%r14
1794         adcq    56(%rdi),%r15
1795         sbbq    %rsi,%rsi
1796
1797         movq    48+56+8(%rsp),%rbx
1798         movl    $8,%ecx
1799         movq    0(%rbp),%rax
1800         jmp     .L8x_tail
1801
1802 .align  32
1803 .L8x_tail:
1804         mulq    %rbx
1805         addq    %rax,%r8
1806         movq    8(%rbp),%rax
1807         movq    %r8,(%rdi)
1808         movq    %rdx,%r8
1809         adcq    $0,%r8
1810
1811         mulq    %rbx
1812         addq    %rax,%r9
1813         movq    16(%rbp),%rax
1814         adcq    $0,%rdx
1815         addq    %r9,%r8
1816         leaq    8(%rdi),%rdi
1817         movq    %rdx,%r9
1818         adcq    $0,%r9
1819
1820         mulq    %rbx
1821         addq    %rax,%r10
1822         movq    24(%rbp),%rax
1823         adcq    $0,%rdx
1824         addq    %r10,%r9
1825         movq    %rdx,%r10
1826         adcq    $0,%r10
1827
1828         mulq    %rbx
1829         addq    %rax,%r11
1830         movq    32(%rbp),%rax
1831         adcq    $0,%rdx
1832         addq    %r11,%r10
1833         movq    %rdx,%r11
1834         adcq    $0,%r11
1835
1836         mulq    %rbx
1837         addq    %rax,%r12
1838         movq    40(%rbp),%rax
1839         adcq    $0,%rdx
1840         addq    %r12,%r11
1841         movq    %rdx,%r12
1842         adcq    $0,%r12
1843
1844         mulq    %rbx
1845         addq    %rax,%r13
1846         movq    48(%rbp),%rax
1847         adcq    $0,%rdx
1848         addq    %r13,%r12
1849         movq    %rdx,%r13
1850         adcq    $0,%r13
1851
1852         mulq    %rbx
1853         addq    %rax,%r14
1854         movq    56(%rbp),%rax
1855         adcq    $0,%rdx
1856         addq    %r14,%r13
1857         movq    %rdx,%r14
1858         adcq    $0,%r14
1859
1860         mulq    %rbx
1861         movq    48-16+8(%rsp,%rcx,8),%rbx
1862         addq    %rax,%r15
1863         adcq    $0,%rdx
1864         addq    %r15,%r14
1865         movq    0(%rbp),%rax
1866         movq    %rdx,%r15
1867         adcq    $0,%r15
1868
1869         decl    %ecx
1870         jnz     .L8x_tail
1871
1872         leaq    64(%rbp),%rbp
1873         movq    8+8(%rsp),%rdx
1874         cmpq    0+8(%rsp),%rbp
1875         jae     .L8x_tail_done
1876
1877         movq    48+56+8(%rsp),%rbx
1878         negq    %rsi
1879         movq    0(%rbp),%rax
1880         adcq    0(%rdi),%r8
1881         adcq    8(%rdi),%r9
1882         adcq    16(%rdi),%r10
1883         adcq    24(%rdi),%r11
1884         adcq    32(%rdi),%r12
1885         adcq    40(%rdi),%r13
1886         adcq    48(%rdi),%r14
1887         adcq    56(%rdi),%r15
1888         sbbq    %rsi,%rsi
1889
1890         movl    $8,%ecx
1891         jmp     .L8x_tail
1892
1893 .align  32
1894 .L8x_tail_done:
1895         xorq    %rax,%rax
1896         addq    (%rdx),%r8
1897         adcq    $0,%r9
1898         adcq    $0,%r10
1899         adcq    $0,%r11
1900         adcq    $0,%r12
1901         adcq    $0,%r13
1902         adcq    $0,%r14
1903         adcq    $0,%r15
1904         adcq    $0,%rax
1905
1906         negq    %rsi
1907 .L8x_no_tail:
1908         adcq    0(%rdi),%r8
1909         adcq    8(%rdi),%r9
1910         adcq    16(%rdi),%r10
1911         adcq    24(%rdi),%r11
1912         adcq    32(%rdi),%r12
1913         adcq    40(%rdi),%r13
1914         adcq    48(%rdi),%r14
1915         adcq    56(%rdi),%r15
1916         adcq    $0,%rax
1917         movq    -8(%rbp),%rcx
1918         xorq    %rsi,%rsi
1919
1920 .byte   102,72,15,126,213
1921
1922         movq    %r8,0(%rdi)
1923         movq    %r9,8(%rdi)
1924 .byte   102,73,15,126,217
1925         movq    %r10,16(%rdi)
1926         movq    %r11,24(%rdi)
1927         movq    %r12,32(%rdi)
1928         movq    %r13,40(%rdi)
1929         movq    %r14,48(%rdi)
1930         movq    %r15,56(%rdi)
1931         leaq    64(%rdi),%rdi
1932
1933         cmpq    %rdx,%rdi
1934         jb      .L8x_reduction_loop
1935         .byte   0xf3,0xc3
1936 .size   bn_sqr8x_internal,.-bn_sqr8x_internal
1937 .type   __bn_post4x_internal,@function
1938 .align  32
1939 __bn_post4x_internal:
1940         movq    0(%rbp),%r12
1941         leaq    (%rdi,%r9,1),%rbx
1942         movq    %r9,%rcx
1943 .byte   102,72,15,126,207
1944         negq    %rax
1945 .byte   102,72,15,126,206
1946         sarq    $3+2,%rcx
1947         decq    %r12
1948         xorq    %r10,%r10
1949         movq    8(%rbp),%r13
1950         movq    16(%rbp),%r14
1951         movq    24(%rbp),%r15
1952         jmp     .Lsqr4x_sub_entry
1953
1954 .align  16
1955 .Lsqr4x_sub:
1956         movq    0(%rbp),%r12
1957         movq    8(%rbp),%r13
1958         movq    16(%rbp),%r14
1959         movq    24(%rbp),%r15
1960 .Lsqr4x_sub_entry:
1961         leaq    32(%rbp),%rbp
1962         notq    %r12
1963         notq    %r13
1964         notq    %r14
1965         notq    %r15
1966         andq    %rax,%r12
1967         andq    %rax,%r13
1968         andq    %rax,%r14
1969         andq    %rax,%r15
1970
1971         negq    %r10
1972         adcq    0(%rbx),%r12
1973         adcq    8(%rbx),%r13
1974         adcq    16(%rbx),%r14
1975         adcq    24(%rbx),%r15
1976         movq    %r12,0(%rdi)
1977         leaq    32(%rbx),%rbx
1978         movq    %r13,8(%rdi)
1979         sbbq    %r10,%r10
1980         movq    %r14,16(%rdi)
1981         movq    %r15,24(%rdi)
1982         leaq    32(%rdi),%rdi
1983
1984         incq    %rcx
1985         jnz     .Lsqr4x_sub
1986
1987         movq    %r9,%r10
1988         negq    %r9
1989         .byte   0xf3,0xc3
1990 .size   __bn_post4x_internal,.-__bn_post4x_internal
1991 .globl  bn_from_montgomery
1992 .type   bn_from_montgomery,@function
1993 .align  32
1994 bn_from_montgomery:
1995         testl   $7,%r9d
1996         jz      bn_from_mont8x
1997         xorl    %eax,%eax
1998         .byte   0xf3,0xc3
1999 .size   bn_from_montgomery,.-bn_from_montgomery
2000
2001 .type   bn_from_mont8x,@function
2002 .align  32
2003 bn_from_mont8x:
2004 .byte   0x67
2005         movq    %rsp,%rax
2006         pushq   %rbx
2007         pushq   %rbp
2008         pushq   %r12
2009         pushq   %r13
2010         pushq   %r14
2011         pushq   %r15
2012 .Lfrom_prologue:
2013
2014         shll    $3,%r9d
2015         leaq    (%r9,%r9,2),%r10
2016         negq    %r9
2017         movq    (%r8),%r8
2018
2019
2020
2021
2022
2023
2024
2025
2026         leaq    -320(%rsp,%r9,2),%r11
2027         movq    %rsp,%rbp
2028         subq    %rdi,%r11
2029         andq    $4095,%r11
2030         cmpq    %r11,%r10
2031         jb      .Lfrom_sp_alt
2032         subq    %r11,%rbp
2033         leaq    -320(%rbp,%r9,2),%rbp
2034         jmp     .Lfrom_sp_done
2035
2036 .align  32
2037 .Lfrom_sp_alt:
2038         leaq    4096-320(,%r9,2),%r10
2039         leaq    -320(%rbp,%r9,2),%rbp
2040         subq    %r10,%r11
2041         movq    $0,%r10
2042         cmovcq  %r10,%r11
2043         subq    %r11,%rbp
2044 .Lfrom_sp_done:
2045         andq    $-64,%rbp
2046         movq    %rsp,%r11
2047         subq    %rbp,%r11
2048         andq    $-4096,%r11
2049         leaq    (%r11,%rbp,1),%rsp
2050         movq    (%rsp),%r10
2051         cmpq    %rbp,%rsp
2052         ja      .Lfrom_page_walk
2053         jmp     .Lfrom_page_walk_done
2054
2055 .Lfrom_page_walk:
2056         leaq    -4096(%rsp),%rsp
2057         movq    (%rsp),%r10
2058         cmpq    %rbp,%rsp
2059         ja      .Lfrom_page_walk
2060 .Lfrom_page_walk_done:
2061
2062         movq    %r9,%r10
2063         negq    %r9
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074         movq    %r8,32(%rsp)
2075         movq    %rax,40(%rsp)
2076 .Lfrom_body:
2077         movq    %r9,%r11
2078         leaq    48(%rsp),%rax
2079         pxor    %xmm0,%xmm0
2080         jmp     .Lmul_by_1
2081
2082 .align  32
2083 .Lmul_by_1:
2084         movdqu  (%rsi),%xmm1
2085         movdqu  16(%rsi),%xmm2
2086         movdqu  32(%rsi),%xmm3
2087         movdqa  %xmm0,(%rax,%r9,1)
2088         movdqu  48(%rsi),%xmm4
2089         movdqa  %xmm0,16(%rax,%r9,1)
2090 .byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2091         movdqa  %xmm1,(%rax)
2092         movdqa  %xmm0,32(%rax,%r9,1)
2093         movdqa  %xmm2,16(%rax)
2094         movdqa  %xmm0,48(%rax,%r9,1)
2095         movdqa  %xmm3,32(%rax)
2096         movdqa  %xmm4,48(%rax)
2097         leaq    64(%rax),%rax
2098         subq    $64,%r11
2099         jnz     .Lmul_by_1
2100
2101 .byte   102,72,15,110,207
2102 .byte   102,72,15,110,209
2103 .byte   0x67
2104         movq    %rcx,%rbp
2105 .byte   102,73,15,110,218
2106         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
2107         andl    $0x80108,%r11d
2108         cmpl    $0x80108,%r11d
2109         jne     .Lfrom_mont_nox
2110
2111         leaq    (%rax,%r9,1),%rdi
2112         call    __bn_sqrx8x_reduction
2113         call    __bn_postx4x_internal
2114
2115         pxor    %xmm0,%xmm0
2116         leaq    48(%rsp),%rax
2117         movq    40(%rsp),%rsi
2118         jmp     .Lfrom_mont_zero
2119
2120 .align  32
2121 .Lfrom_mont_nox:
2122         call    __bn_sqr8x_reduction
2123         call    __bn_post4x_internal
2124
2125         pxor    %xmm0,%xmm0
2126         leaq    48(%rsp),%rax
2127         movq    40(%rsp),%rsi
2128         jmp     .Lfrom_mont_zero
2129
2130 .align  32
2131 .Lfrom_mont_zero:
2132         movdqa  %xmm0,0(%rax)
2133         movdqa  %xmm0,16(%rax)
2134         movdqa  %xmm0,32(%rax)
2135         movdqa  %xmm0,48(%rax)
2136         leaq    64(%rax),%rax
2137         subq    $32,%r9
2138         jnz     .Lfrom_mont_zero
2139
2140         movq    $1,%rax
2141         movq    -48(%rsi),%r15
2142         movq    -40(%rsi),%r14
2143         movq    -32(%rsi),%r13
2144         movq    -24(%rsi),%r12
2145         movq    -16(%rsi),%rbp
2146         movq    -8(%rsi),%rbx
2147         leaq    (%rsi),%rsp
2148 .Lfrom_epilogue:
2149         .byte   0xf3,0xc3
2150 .size   bn_from_mont8x,.-bn_from_mont8x
2151 .type   bn_mulx4x_mont_gather5,@function
2152 .align  32
2153 bn_mulx4x_mont_gather5:
2154         movq    %rsp,%rax
2155 .Lmulx4x_enter:
2156         pushq   %rbx
2157         pushq   %rbp
2158         pushq   %r12
2159         pushq   %r13
2160         pushq   %r14
2161         pushq   %r15
2162 .Lmulx4x_prologue:
2163
2164         shll    $3,%r9d
2165         leaq    (%r9,%r9,2),%r10
2166         negq    %r9
2167         movq    (%r8),%r8
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178         leaq    -320(%rsp,%r9,2),%r11
2179         movq    %rsp,%rbp
2180         subq    %rdi,%r11
2181         andq    $4095,%r11
2182         cmpq    %r11,%r10
2183         jb      .Lmulx4xsp_alt
2184         subq    %r11,%rbp
2185         leaq    -320(%rbp,%r9,2),%rbp
2186         jmp     .Lmulx4xsp_done
2187
2188 .Lmulx4xsp_alt:
2189         leaq    4096-320(,%r9,2),%r10
2190         leaq    -320(%rbp,%r9,2),%rbp
2191         subq    %r10,%r11
2192         movq    $0,%r10
2193         cmovcq  %r10,%r11
2194         subq    %r11,%rbp
2195 .Lmulx4xsp_done:
2196         andq    $-64,%rbp
2197         movq    %rsp,%r11
2198         subq    %rbp,%r11
2199         andq    $-4096,%r11
2200         leaq    (%r11,%rbp,1),%rsp
2201         movq    (%rsp),%r10
2202         cmpq    %rbp,%rsp
2203         ja      .Lmulx4x_page_walk
2204         jmp     .Lmulx4x_page_walk_done
2205
2206 .Lmulx4x_page_walk:
2207         leaq    -4096(%rsp),%rsp
2208         movq    (%rsp),%r10
2209         cmpq    %rbp,%rsp
2210         ja      .Lmulx4x_page_walk
2211 .Lmulx4x_page_walk_done:
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225         movq    %r8,32(%rsp)
2226         movq    %rax,40(%rsp)
2227 .Lmulx4x_body:
2228         call    mulx4x_internal
2229
2230         movq    40(%rsp),%rsi
2231         movq    $1,%rax
2232
2233         movq    -48(%rsi),%r15
2234         movq    -40(%rsi),%r14
2235         movq    -32(%rsi),%r13
2236         movq    -24(%rsi),%r12
2237         movq    -16(%rsi),%rbp
2238         movq    -8(%rsi),%rbx
2239         leaq    (%rsi),%rsp
2240 .Lmulx4x_epilogue:
2241         .byte   0xf3,0xc3
2242 .size   bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2243
2244 .type   mulx4x_internal,@function
2245 .align  32
2246 mulx4x_internal:
2247         movq    %r9,8(%rsp)
2248         movq    %r9,%r10
2249         negq    %r9
2250         shlq    $5,%r9
2251         negq    %r10
2252         leaq    128(%rdx,%r9,1),%r13
2253         shrq    $5+5,%r9
2254         movd    8(%rax),%xmm5
2255         subq    $1,%r9
2256         leaq    .Linc(%rip),%rax
2257         movq    %r13,16+8(%rsp)
2258         movq    %r9,24+8(%rsp)
2259         movq    %rdi,56+8(%rsp)
2260         movdqa  0(%rax),%xmm0
2261         movdqa  16(%rax),%xmm1
2262         leaq    88-112(%rsp,%r10,1),%r10
2263         leaq    128(%rdx),%rdi
2264
2265         pshufd  $0,%xmm5,%xmm5
2266         movdqa  %xmm1,%xmm4
2267 .byte   0x67
2268         movdqa  %xmm1,%xmm2
2269 .byte   0x67
2270         paddd   %xmm0,%xmm1
2271         pcmpeqd %xmm5,%xmm0
2272         movdqa  %xmm4,%xmm3
2273         paddd   %xmm1,%xmm2
2274         pcmpeqd %xmm5,%xmm1
2275         movdqa  %xmm0,112(%r10)
2276         movdqa  %xmm4,%xmm0
2277
2278         paddd   %xmm2,%xmm3
2279         pcmpeqd %xmm5,%xmm2
2280         movdqa  %xmm1,128(%r10)
2281         movdqa  %xmm4,%xmm1
2282
2283         paddd   %xmm3,%xmm0
2284         pcmpeqd %xmm5,%xmm3
2285         movdqa  %xmm2,144(%r10)
2286         movdqa  %xmm4,%xmm2
2287
2288         paddd   %xmm0,%xmm1
2289         pcmpeqd %xmm5,%xmm0
2290         movdqa  %xmm3,160(%r10)
2291         movdqa  %xmm4,%xmm3
2292         paddd   %xmm1,%xmm2
2293         pcmpeqd %xmm5,%xmm1
2294         movdqa  %xmm0,176(%r10)
2295         movdqa  %xmm4,%xmm0
2296
2297         paddd   %xmm2,%xmm3
2298         pcmpeqd %xmm5,%xmm2
2299         movdqa  %xmm1,192(%r10)
2300         movdqa  %xmm4,%xmm1
2301
2302         paddd   %xmm3,%xmm0
2303         pcmpeqd %xmm5,%xmm3
2304         movdqa  %xmm2,208(%r10)
2305         movdqa  %xmm4,%xmm2
2306
2307         paddd   %xmm0,%xmm1
2308         pcmpeqd %xmm5,%xmm0
2309         movdqa  %xmm3,224(%r10)
2310         movdqa  %xmm4,%xmm3
2311         paddd   %xmm1,%xmm2
2312         pcmpeqd %xmm5,%xmm1
2313         movdqa  %xmm0,240(%r10)
2314         movdqa  %xmm4,%xmm0
2315
2316         paddd   %xmm2,%xmm3
2317         pcmpeqd %xmm5,%xmm2
2318         movdqa  %xmm1,256(%r10)
2319         movdqa  %xmm4,%xmm1
2320
2321         paddd   %xmm3,%xmm0
2322         pcmpeqd %xmm5,%xmm3
2323         movdqa  %xmm2,272(%r10)
2324         movdqa  %xmm4,%xmm2
2325
2326         paddd   %xmm0,%xmm1
2327         pcmpeqd %xmm5,%xmm0
2328         movdqa  %xmm3,288(%r10)
2329         movdqa  %xmm4,%xmm3
2330 .byte   0x67
2331         paddd   %xmm1,%xmm2
2332         pcmpeqd %xmm5,%xmm1
2333         movdqa  %xmm0,304(%r10)
2334
2335         paddd   %xmm2,%xmm3
2336         pcmpeqd %xmm5,%xmm2
2337         movdqa  %xmm1,320(%r10)
2338
2339         pcmpeqd %xmm5,%xmm3
2340         movdqa  %xmm2,336(%r10)
2341
2342         pand    64(%rdi),%xmm0
2343         pand    80(%rdi),%xmm1
2344         pand    96(%rdi),%xmm2
2345         movdqa  %xmm3,352(%r10)
2346         pand    112(%rdi),%xmm3
2347         por     %xmm2,%xmm0
2348         por     %xmm3,%xmm1
2349         movdqa  -128(%rdi),%xmm4
2350         movdqa  -112(%rdi),%xmm5
2351         movdqa  -96(%rdi),%xmm2
2352         pand    112(%r10),%xmm4
2353         movdqa  -80(%rdi),%xmm3
2354         pand    128(%r10),%xmm5
2355         por     %xmm4,%xmm0
2356         pand    144(%r10),%xmm2
2357         por     %xmm5,%xmm1
2358         pand    160(%r10),%xmm3
2359         por     %xmm2,%xmm0
2360         por     %xmm3,%xmm1
2361         movdqa  -64(%rdi),%xmm4
2362         movdqa  -48(%rdi),%xmm5
2363         movdqa  -32(%rdi),%xmm2
2364         pand    176(%r10),%xmm4
2365         movdqa  -16(%rdi),%xmm3
2366         pand    192(%r10),%xmm5
2367         por     %xmm4,%xmm0
2368         pand    208(%r10),%xmm2
2369         por     %xmm5,%xmm1
2370         pand    224(%r10),%xmm3
2371         por     %xmm2,%xmm0
2372         por     %xmm3,%xmm1
2373         movdqa  0(%rdi),%xmm4
2374         movdqa  16(%rdi),%xmm5
2375         movdqa  32(%rdi),%xmm2
2376         pand    240(%r10),%xmm4
2377         movdqa  48(%rdi),%xmm3
2378         pand    256(%r10),%xmm5
2379         por     %xmm4,%xmm0
2380         pand    272(%r10),%xmm2
2381         por     %xmm5,%xmm1
2382         pand    288(%r10),%xmm3
2383         por     %xmm2,%xmm0
2384         por     %xmm3,%xmm1
2385         pxor    %xmm1,%xmm0
2386         pshufd  $0x4e,%xmm0,%xmm1
2387         por     %xmm1,%xmm0
2388         leaq    256(%rdi),%rdi
2389 .byte   102,72,15,126,194
2390         leaq    64+32+8(%rsp),%rbx
2391
2392         movq    %rdx,%r9
2393         mulxq   0(%rsi),%r8,%rax
2394         mulxq   8(%rsi),%r11,%r12
2395         addq    %rax,%r11
2396         mulxq   16(%rsi),%rax,%r13
2397         adcq    %rax,%r12
2398         adcq    $0,%r13
2399         mulxq   24(%rsi),%rax,%r14
2400
2401         movq    %r8,%r15
2402         imulq   32+8(%rsp),%r8
2403         xorq    %rbp,%rbp
2404         movq    %r8,%rdx
2405
2406         movq    %rdi,8+8(%rsp)
2407
2408         leaq    32(%rsi),%rsi
2409         adcxq   %rax,%r13
2410         adcxq   %rbp,%r14
2411
2412         mulxq   0(%rcx),%rax,%r10
2413         adcxq   %rax,%r15
2414         adoxq   %r11,%r10
2415         mulxq   8(%rcx),%rax,%r11
2416         adcxq   %rax,%r10
2417         adoxq   %r12,%r11
2418         mulxq   16(%rcx),%rax,%r12
2419         movq    24+8(%rsp),%rdi
2420         movq    %r10,-32(%rbx)
2421         adcxq   %rax,%r11
2422         adoxq   %r13,%r12
2423         mulxq   24(%rcx),%rax,%r15
2424         movq    %r9,%rdx
2425         movq    %r11,-24(%rbx)
2426         adcxq   %rax,%r12
2427         adoxq   %rbp,%r15
2428         leaq    32(%rcx),%rcx
2429         movq    %r12,-16(%rbx)
2430         jmp     .Lmulx4x_1st
2431
2432 .align  32
2433 .Lmulx4x_1st:
2434         adcxq   %rbp,%r15
2435         mulxq   0(%rsi),%r10,%rax
2436         adcxq   %r14,%r10
2437         mulxq   8(%rsi),%r11,%r14
2438         adcxq   %rax,%r11
2439         mulxq   16(%rsi),%r12,%rax
2440         adcxq   %r14,%r12
2441         mulxq   24(%rsi),%r13,%r14
2442 .byte   0x67,0x67
2443         movq    %r8,%rdx
2444         adcxq   %rax,%r13
2445         adcxq   %rbp,%r14
2446         leaq    32(%rsi),%rsi
2447         leaq    32(%rbx),%rbx
2448
2449         adoxq   %r15,%r10
2450         mulxq   0(%rcx),%rax,%r15
2451         adcxq   %rax,%r10
2452         adoxq   %r15,%r11
2453         mulxq   8(%rcx),%rax,%r15
2454         adcxq   %rax,%r11
2455         adoxq   %r15,%r12
2456         mulxq   16(%rcx),%rax,%r15
2457         movq    %r10,-40(%rbx)
2458         adcxq   %rax,%r12
2459         movq    %r11,-32(%rbx)
2460         adoxq   %r15,%r13
2461         mulxq   24(%rcx),%rax,%r15
2462         movq    %r9,%rdx
2463         movq    %r12,-24(%rbx)
2464         adcxq   %rax,%r13
2465         adoxq   %rbp,%r15
2466         leaq    32(%rcx),%rcx
2467         movq    %r13,-16(%rbx)
2468
2469         decq    %rdi
2470         jnz     .Lmulx4x_1st
2471
2472         movq    8(%rsp),%rax
2473         adcq    %rbp,%r15
2474         leaq    (%rsi,%rax,1),%rsi
2475         addq    %r15,%r14
2476         movq    8+8(%rsp),%rdi
2477         adcq    %rbp,%rbp
2478         movq    %r14,-8(%rbx)
2479         jmp     .Lmulx4x_outer
2480
2481 .align  32
2482 .Lmulx4x_outer:
2483         leaq    16-256(%rbx),%r10
2484         pxor    %xmm4,%xmm4
2485 .byte   0x67,0x67
2486         pxor    %xmm5,%xmm5
2487         movdqa  -128(%rdi),%xmm0
2488         movdqa  -112(%rdi),%xmm1
2489         movdqa  -96(%rdi),%xmm2
2490         pand    256(%r10),%xmm0
2491         movdqa  -80(%rdi),%xmm3
2492         pand    272(%r10),%xmm1
2493         por     %xmm0,%xmm4
2494         pand    288(%r10),%xmm2
2495         por     %xmm1,%xmm5
2496         pand    304(%r10),%xmm3
2497         por     %xmm2,%xmm4
2498         por     %xmm3,%xmm5
2499         movdqa  -64(%rdi),%xmm0
2500         movdqa  -48(%rdi),%xmm1
2501         movdqa  -32(%rdi),%xmm2
2502         pand    320(%r10),%xmm0
2503         movdqa  -16(%rdi),%xmm3
2504         pand    336(%r10),%xmm1
2505         por     %xmm0,%xmm4
2506         pand    352(%r10),%xmm2
2507         por     %xmm1,%xmm5
2508         pand    368(%r10),%xmm3
2509         por     %xmm2,%xmm4
2510         por     %xmm3,%xmm5
2511         movdqa  0(%rdi),%xmm0
2512         movdqa  16(%rdi),%xmm1
2513         movdqa  32(%rdi),%xmm2
2514         pand    384(%r10),%xmm0
2515         movdqa  48(%rdi),%xmm3
2516         pand    400(%r10),%xmm1
2517         por     %xmm0,%xmm4
2518         pand    416(%r10),%xmm2
2519         por     %xmm1,%xmm5
2520         pand    432(%r10),%xmm3
2521         por     %xmm2,%xmm4
2522         por     %xmm3,%xmm5
2523         movdqa  64(%rdi),%xmm0
2524         movdqa  80(%rdi),%xmm1
2525         movdqa  96(%rdi),%xmm2
2526         pand    448(%r10),%xmm0
2527         movdqa  112(%rdi),%xmm3
2528         pand    464(%r10),%xmm1
2529         por     %xmm0,%xmm4
2530         pand    480(%r10),%xmm2
2531         por     %xmm1,%xmm5
2532         pand    496(%r10),%xmm3
2533         por     %xmm2,%xmm4
2534         por     %xmm3,%xmm5
2535         por     %xmm5,%xmm4
2536         pshufd  $0x4e,%xmm4,%xmm0
2537         por     %xmm4,%xmm0
2538         leaq    256(%rdi),%rdi
2539 .byte   102,72,15,126,194
2540
2541         movq    %rbp,(%rbx)
2542         leaq    32(%rbx,%rax,1),%rbx
2543         mulxq   0(%rsi),%r8,%r11
2544         xorq    %rbp,%rbp
2545         movq    %rdx,%r9
2546         mulxq   8(%rsi),%r14,%r12
2547         adoxq   -32(%rbx),%r8
2548         adcxq   %r14,%r11
2549         mulxq   16(%rsi),%r15,%r13
2550         adoxq   -24(%rbx),%r11
2551         adcxq   %r15,%r12
2552         mulxq   24(%rsi),%rdx,%r14
2553         adoxq   -16(%rbx),%r12
2554         adcxq   %rdx,%r13
2555         leaq    (%rcx,%rax,1),%rcx
2556         leaq    32(%rsi),%rsi
2557         adoxq   -8(%rbx),%r13
2558         adcxq   %rbp,%r14
2559         adoxq   %rbp,%r14
2560
2561         movq    %r8,%r15
2562         imulq   32+8(%rsp),%r8
2563
2564         movq    %r8,%rdx
2565         xorq    %rbp,%rbp
2566         movq    %rdi,8+8(%rsp)
2567
2568         mulxq   0(%rcx),%rax,%r10
2569         adcxq   %rax,%r15
2570         adoxq   %r11,%r10
2571         mulxq   8(%rcx),%rax,%r11
2572         adcxq   %rax,%r10
2573         adoxq   %r12,%r11
2574         mulxq   16(%rcx),%rax,%r12
2575         adcxq   %rax,%r11
2576         adoxq   %r13,%r12
2577         mulxq   24(%rcx),%rax,%r15
2578         movq    %r9,%rdx
2579         movq    24+8(%rsp),%rdi
2580         movq    %r10,-32(%rbx)
2581         adcxq   %rax,%r12
2582         movq    %r11,-24(%rbx)
2583         adoxq   %rbp,%r15
2584         movq    %r12,-16(%rbx)
2585         leaq    32(%rcx),%rcx
2586         jmp     .Lmulx4x_inner
2587
2588 .align  32
2589 .Lmulx4x_inner:
2590         mulxq   0(%rsi),%r10,%rax
2591         adcxq   %rbp,%r15
2592         adoxq   %r14,%r10
2593         mulxq   8(%rsi),%r11,%r14
2594         adcxq   0(%rbx),%r10
2595         adoxq   %rax,%r11
2596         mulxq   16(%rsi),%r12,%rax
2597         adcxq   8(%rbx),%r11
2598         adoxq   %r14,%r12
2599         mulxq   24(%rsi),%r13,%r14
2600         movq    %r8,%rdx
2601         adcxq   16(%rbx),%r12
2602         adoxq   %rax,%r13
2603         adcxq   24(%rbx),%r13
2604         adoxq   %rbp,%r14
2605         leaq    32(%rsi),%rsi
2606         leaq    32(%rbx),%rbx
2607         adcxq   %rbp,%r14
2608
2609         adoxq   %r15,%r10
2610         mulxq   0(%rcx),%rax,%r15
2611         adcxq   %rax,%r10
2612         adoxq   %r15,%r11
2613         mulxq   8(%rcx),%rax,%r15
2614         adcxq   %rax,%r11
2615         adoxq   %r15,%r12
2616         mulxq   16(%rcx),%rax,%r15
2617         movq    %r10,-40(%rbx)
2618         adcxq   %rax,%r12
2619         adoxq   %r15,%r13
2620         movq    %r11,-32(%rbx)
2621         mulxq   24(%rcx),%rax,%r15
2622         movq    %r9,%rdx
2623         leaq    32(%rcx),%rcx
2624         movq    %r12,-24(%rbx)
2625         adcxq   %rax,%r13
2626         adoxq   %rbp,%r15
2627         movq    %r13,-16(%rbx)
2628
2629         decq    %rdi
2630         jnz     .Lmulx4x_inner
2631
2632         movq    0+8(%rsp),%rax
2633         adcq    %rbp,%r15
2634         subq    0(%rbx),%rdi
2635         movq    8+8(%rsp),%rdi
2636         movq    16+8(%rsp),%r10
2637         adcq    %r15,%r14
2638         leaq    (%rsi,%rax,1),%rsi
2639         adcq    %rbp,%rbp
2640         movq    %r14,-8(%rbx)
2641
2642         cmpq    %r10,%rdi
2643         jb      .Lmulx4x_outer
2644
2645         movq    -8(%rcx),%r10
2646         movq    %rbp,%r8
2647         movq    (%rcx,%rax,1),%r12
2648         leaq    (%rcx,%rax,1),%rbp
2649         movq    %rax,%rcx
2650         leaq    (%rbx,%rax,1),%rdi
2651         xorl    %eax,%eax
2652         xorq    %r15,%r15
2653         subq    %r14,%r10
2654         adcq    %r15,%r15
2655         orq     %r15,%r8
2656         sarq    $3+2,%rcx
2657         subq    %r8,%rax
2658         movq    56+8(%rsp),%rdx
2659         decq    %r12
2660         movq    8(%rbp),%r13
2661         xorq    %r8,%r8
2662         movq    16(%rbp),%r14
2663         movq    24(%rbp),%r15
2664         jmp     .Lsqrx4x_sub_entry
2665 .size   mulx4x_internal,.-mulx4x_internal
# ----------------------------------------------------------------------------
# bn_powerx5 — MULX/ADX (BMI2+ADX) path of the Montgomery "power5" step.
# Performs five modular squarings (five __bn_sqrx8x_internal +
# __bn_postx4x_internal call pairs) followed by one Montgomery
# multiplication via mulx4x_internal, i.e. one a^(2^5)*b ladder step.
# NOTE(review): this file is auto-generated from x86_64-mont5.pl — do not
# hand-edit the instruction stream; fix the generator instead.  The leading
# numbers on each line are repo-viewer artifacts, not assembly — TODO strip
# before assembling.
# In (SysV AMD64, per generator convention — confirm against the .pl source):
#   rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9=num (words).
# Clobbers everything volatile; callee-saved regs are pushed below.
2666 .type   bn_powerx5,@function
2667 .align  32
2668 bn_powerx5:
2669         movq    %rsp,%rax                # keep original rsp for epilogue
2670 .Lpowerx5_enter:
2671         pushq   %rbx
2672         pushq   %rbp
2673         pushq   %r12
2674         pushq   %r13
2675         pushq   %r14
2676         pushq   %r15
2677 .Lpowerx5_prologue:
2678
2679         shll    $3,%r9d                  # num words -> num bytes
2680         leaq    (%r9,%r9,2),%r10         # r10 = 3*num bytes
2681         negq    %r9
2682         movq    (%r8),%r8                # load n0 value from pointer
2683
2684
2685
2686
2687
2688
2689
2690
# Pick a stack frame whose low 12 bits differ from rdi to avoid 4K
# cache-bank aliasing between the temporary area and the result buffer.
2691         leaq    -320(%rsp,%r9,2),%r11
2692         movq    %rsp,%rbp
2693         subq    %rdi,%r11
2694         andq    $4095,%r11
2695         cmpq    %r11,%r10
2696         jb      .Lpwrx_sp_alt
2697         subq    %r11,%rbp
2698         leaq    -320(%rbp,%r9,2),%rbp
2699         jmp     .Lpwrx_sp_done
2700
2701 .align  32
2702 .Lpwrx_sp_alt:
2703         leaq    4096-320(,%r9,2),%r10
2704         leaq    -320(%rbp,%r9,2),%rbp
2705         subq    %r10,%r11
2706         movq    $0,%r10
2707         cmovcq  %r10,%r11
2708         subq    %r11,%rbp
2709 .Lpwrx_sp_done:
2710         andq    $-64,%rbp                # 64-byte align the frame
2711         movq    %rsp,%r11
2712         subq    %rbp,%r11
2713         andq    $-4096,%r11
2714         leaq    (%r11,%rbp,1),%rsp
2715         movq    (%rsp),%r10              # probe (touch) new stack page
2716         cmpq    %rbp,%rsp
2717         ja      .Lpwrx_page_walk
2718         jmp     .Lpwrx_page_walk_done
2719
# Touch every 4K page down to the new frame so guard pages are grown.
2720 .Lpwrx_page_walk:
2721         leaq    -4096(%rsp),%rsp
2722         movq    (%rsp),%r10
2723         cmpq    %rbp,%rsp
2724         ja      .Lpwrx_page_walk
2725 .Lpwrx_page_walk_done:
2726
2727         movq    %r9,%r10                 # r10 = -num bytes (saved count)
2728         negq    %r9                      # r9 = num bytes again
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741         pxor    %xmm0,%xmm0
# Hand-encoded movq r64->xmm: stash rdi,rcx,r10,rdx in xmm1..xmm4
# so they survive the internal calls below (presumed encoding — the
# generator emits these as raw bytes for old assemblers).
2742 .byte   102,72,15,110,207
2743 .byte   102,72,15,110,209
2744 .byte   102,73,15,110,218
2745 .byte   102,72,15,110,226
2746         movq    %r8,32(%rsp)             # n0 at 32(%rsp) for callees
2747         movq    %rax,40(%rsp)            # saved original rsp
2748 .Lpowerx5_body:
2749
# Five modular squarings: square + conditional-subtract each time.
2750         call    __bn_sqrx8x_internal
2751         call    __bn_postx4x_internal
2752         call    __bn_sqrx8x_internal
2753         call    __bn_postx4x_internal
2754         call    __bn_sqrx8x_internal
2755         call    __bn_postx4x_internal
2756         call    __bn_sqrx8x_internal
2757         call    __bn_postx4x_internal
2758         call    __bn_sqrx8x_internal
2759         call    __bn_postx4x_internal
2760
2761         movq    %r10,%r9                 # restore length
2762         movq    %rsi,%rdi
# movq %xmm2,%rcx / movq %xmm4,%rdx — recover stashed pointers.
2763 .byte   102,72,15,126,209
2764 .byte   102,72,15,126,226
2765         movq    40(%rsp),%rax
2766
2767         call    mulx4x_internal          # final multiply of the ladder step
2768
2769         movq    40(%rsp),%rsi            # rsi = saved original rsp
2770         movq    $1,%rax                  # return 1 (success)
2771
# Restore callee-saved registers relative to the original stack pointer.
2772         movq    -48(%rsi),%r15
2773         movq    -40(%rsi),%r14
2774         movq    -32(%rsi),%r13
2775         movq    -24(%rsi),%r12
2776         movq    -16(%rsi),%rbp
2777         movq    -8(%rsi),%rbx
2778         leaq    (%rsi),%rsp
2779 .Lpowerx5_epilogue:
2780         .byte   0xf3,0xc3                # rep ret
2781 .size   bn_powerx5,.-bn_powerx5
2782
# ----------------------------------------------------------------------------
# bn_sqrx8x_internal / __bn_sqrx8x_internal — MULX/ADCX/ADOX squaring of a
# num-qword operand, followed (falls through) by __bn_sqrx8x_reduction,
# the Montgomery reduction of the double-width product.
# Auto-generated from x86_64-mont5.pl — do not hand-edit; the many raw
# .byte sequences are hand-encoded mulx/mov forms for old assemblers and
# padding prefixes (0x3e, 0x66, 0x67) for code alignment/tuning.
# Register/stack contract (as used by the code below — confirm in .pl):
#   rsi = source a[], r9 = byte length, rbp = &a[num] then modulus ptr,
#   48+8(%rsp) = temporary product area, 32+8(%rsp) = n0,
#   0+8(%rsp)/8+8(%rsp) = saved length / end pointer.
# NOTE(review): leading per-line numbers are repo-viewer artifacts, not
# assembly — must be stripped before this text can assemble.
2783 .globl  bn_sqrx8x_internal
2784 .hidden bn_sqrx8x_internal
2785 .type   bn_sqrx8x_internal,@function
2786 .align  32
2787 bn_sqrx8x_internal:
2788 __bn_sqrx8x_internal:
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829         leaq    48+8(%rsp),%rdi          # rdi = temp product area
2830         leaq    (%rsi,%r9,1),%rbp        # rbp = end of input
2831         movq    %r9,0+8(%rsp)            # save byte length
2832         movq    %rbp,8+8(%rsp)           # save end pointer
2833         jmp     .Lsqr8x_zero_start
2834
2835 .align  32
2836 .byte   0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
# Zero the 2*num-qword product area, 128 bytes per iteration.
2837 .Lsqrx8x_zero:
2838 .byte   0x3e
2839         movdqa  %xmm0,0(%rdi)
2840         movdqa  %xmm0,16(%rdi)
2841         movdqa  %xmm0,32(%rdi)
2842         movdqa  %xmm0,48(%rdi)
2843 .Lsqr8x_zero_start:
2844         movdqa  %xmm0,64(%rdi)
2845         movdqa  %xmm0,80(%rdi)
2846         movdqa  %xmm0,96(%rdi)
2847         movdqa  %xmm0,112(%rdi)
2848         leaq    128(%rdi),%rdi
2849         subq    $64,%r9
2850         jnz     .Lsqrx8x_zero
2851
2852         movq    0(%rsi),%rdx             # rdx = a[0], implicit mulx operand
2853
2854         xorq    %r10,%r10
2855         xorq    %r11,%r11
2856         xorq    %r12,%r12
2857         xorq    %r13,%r13
2858         xorq    %r14,%r14
2859         xorq    %r15,%r15
2860         leaq    48+8(%rsp),%rdi
2861         xorq    %rbp,%rbp                # rbp = 0, used as carry feeder
2862         jmp     .Lsqrx8x_outer_loop
2863
2864 .align  32
# Off-diagonal cross products for an 8-qword strip: a[i]*a[j], i<j,
# accumulated with the dual adcx (CF) / adox (OF) carry chains.
2865 .Lsqrx8x_outer_loop:
2866         mulxq   8(%rsi),%r8,%rax
2867         adcxq   %r9,%r8
2868         adoxq   %rax,%r10
2869         mulxq   16(%rsi),%r9,%rax
2870         adcxq   %r10,%r9
2871         adoxq   %rax,%r11
2872 .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2873         adcxq   %r11,%r10
2874         adoxq   %rax,%r12
2875 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2876         adcxq   %r12,%r11
2877         adoxq   %rax,%r13
2878         mulxq   40(%rsi),%r12,%rax
2879         adcxq   %r13,%r12
2880         adoxq   %rax,%r14
2881         mulxq   48(%rsi),%r13,%rax
2882         adcxq   %r14,%r13
2883         adoxq   %r15,%rax
2884         mulxq   56(%rsi),%r14,%r15
2885         movq    8(%rsi),%rdx             # next multiplier a[1]
2886         adcxq   %rax,%r14
2887         adoxq   %rbp,%r15
2888         adcq    64(%rdi),%r15
2889         movq    %r8,8(%rdi)
2890         movq    %r9,16(%rdi)
2891         sbbq    %rcx,%rcx                # rcx = -carry
2892         xorq    %rbp,%rbp                # clear CF/OF and rbp
2893
2894
2895         mulxq   16(%rsi),%r8,%rbx
2896         mulxq   24(%rsi),%r9,%rax
2897         adcxq   %r10,%r8
2898         adoxq   %rbx,%r9
2899         mulxq   32(%rsi),%r10,%rbx
2900         adcxq   %r11,%r9
2901         adoxq   %rax,%r10
2902 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2903         adcxq   %r12,%r10
2904         adoxq   %rbx,%r11
2905 .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2906         adcxq   %r13,%r11
2907         adoxq   %r14,%r12
2908 .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2909         movq    16(%rsi),%rdx            # next multiplier a[2]
2910         adcxq   %rax,%r12
2911         adoxq   %rbx,%r13
2912         adcxq   %r15,%r13
2913         adoxq   %rbp,%r14
2914         adcxq   %rbp,%r14
2915
2916         movq    %r8,24(%rdi)
2917         movq    %r9,32(%rdi)
2918
2919         mulxq   24(%rsi),%r8,%rbx
2920         mulxq   32(%rsi),%r9,%rax
2921         adcxq   %r10,%r8
2922         adoxq   %rbx,%r9
2923         mulxq   40(%rsi),%r10,%rbx
2924         adcxq   %r11,%r9
2925         adoxq   %rax,%r10
2926 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2927         adcxq   %r12,%r10
2928         adoxq   %r13,%r11
2929 .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2930 .byte   0x3e
2931         movq    24(%rsi),%rdx            # next multiplier a[3]
2932         adcxq   %rbx,%r11
2933         adoxq   %rax,%r12
2934         adcxq   %r14,%r12
2935         movq    %r8,40(%rdi)
2936         movq    %r9,48(%rdi)
2937         mulxq   32(%rsi),%r8,%rax
2938         adoxq   %rbp,%r13
2939         adcxq   %rbp,%r13
2940
2941         mulxq   40(%rsi),%r9,%rbx
2942         adcxq   %r10,%r8
2943         adoxq   %rax,%r9
2944         mulxq   48(%rsi),%r10,%rax
2945         adcxq   %r11,%r9
2946         adoxq   %r12,%r10
2947         mulxq   56(%rsi),%r11,%r12
2948         movq    32(%rsi),%rdx            # next multiplier a[4]
2949         movq    40(%rsi),%r14
2950         adcxq   %rbx,%r10
2951         adoxq   %rax,%r11
2952         movq    48(%rsi),%r15
2953         adcxq   %r13,%r11
2954         adoxq   %rbp,%r12
2955         adcxq   %rbp,%r12
2956
2957         movq    %r8,56(%rdi)
2958         movq    %r9,64(%rdi)
2959
2960         mulxq   %r14,%r9,%rax
2961         movq    56(%rsi),%r8
2962         adcxq   %r10,%r9
2963         mulxq   %r15,%r10,%rbx
2964         adoxq   %rax,%r10
2965         adcxq   %r11,%r10
2966         mulxq   %r8,%r11,%rax
2967         movq    %r14,%rdx                # next multiplier a[5]
2968         adoxq   %rbx,%r11
2969         adcxq   %r12,%r11
2970
2971         adcxq   %rbp,%rax
2972
2973         mulxq   %r15,%r14,%rbx
2974         mulxq   %r8,%r12,%r13
2975         movq    %r15,%rdx                # next multiplier a[6]
2976         leaq    64(%rsi),%rsi
2977         adcxq   %r14,%r11
2978         adoxq   %rbx,%r12
2979         adcxq   %rax,%r12
2980         adoxq   %rbp,%r13
2981
2982 .byte   0x67,0x67
2983         mulxq   %r8,%r8,%r14             # a[6]*a[7], last cross product
2984         adcxq   %r8,%r13
2985         adcxq   %rbp,%r14
2986
2987         cmpq    8+8(%rsp),%rsi           # processed whole input?
2988         je      .Lsqrx8x_outer_break
2989
2990         negq    %rcx                     # restore saved carry into CF
2991         movq    $-8,%rcx                 # inner loop counter
2992         movq    %rbp,%r15
2993         movq    64(%rdi),%r8
2994         adcxq   72(%rdi),%r9             # fold in previously stored partials
2995         adcxq   80(%rdi),%r10
2996         adcxq   88(%rdi),%r11
2997         adcq    96(%rdi),%r12
2998         adcq    104(%rdi),%r13
2999         adcq    112(%rdi),%r14
3000         adcq    120(%rdi),%r15
3001         leaq    (%rsi),%rbp
3002         leaq    128(%rdi),%rdi
3003         sbbq    %rax,%rax                # rax = -carry
3004
3005         movq    -64(%rsi),%rdx
3006         movq    %rax,16+8(%rsp)          # stash carry
3007         movq    %rdi,24+8(%rsp)          # stash output pointer
3008
3009
3010         xorl    %eax,%eax                # clear CF/OF for adcx/adox chains
3011         jmp     .Lsqrx8x_loop
3012
3013 .align  32
# Inner multiply-accumulate: one 8-qword column per iteration,
# rcx counts -8..0 so jrcxz/jnz sees the end naturally.
3014 .Lsqrx8x_loop:
3015         movq    %r8,%rbx
3016         mulxq   0(%rbp),%rax,%r8
3017         adcxq   %rax,%rbx
3018         adoxq   %r9,%r8
3019
3020         mulxq   8(%rbp),%rax,%r9
3021         adcxq   %rax,%r8
3022         adoxq   %r10,%r9
3023
3024         mulxq   16(%rbp),%rax,%r10
3025         adcxq   %rax,%r9
3026         adoxq   %r11,%r10
3027
3028         mulxq   24(%rbp),%rax,%r11
3029         adcxq   %rax,%r10
3030         adoxq   %r12,%r11
3031
3032 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3033         adcxq   %rax,%r11
3034         adoxq   %r13,%r12
3035
3036         mulxq   40(%rbp),%rax,%r13
3037         adcxq   %rax,%r12
3038         adoxq   %r14,%r13
3039
3040         mulxq   48(%rbp),%rax,%r14
3041         movq    %rbx,(%rdi,%rcx,8)       # store finished limb
3042         movl    $0,%ebx
3043         adcxq   %rax,%r13
3044         adoxq   %r15,%r14
3045
3046 .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3047         movq    8(%rsi,%rcx,8),%rdx      # next multiplier
3048         adcxq   %rax,%r14
3049         adoxq   %rbx,%r15
3050         adcxq   %rbx,%r15
3051
3052 .byte   0x67
3053         incq    %rcx
3054         jnz     .Lsqrx8x_loop
3055
3056         leaq    64(%rbp),%rbp
3057         movq    $-8,%rcx
3058         cmpq    8+8(%rsp),%rbp           # reached end of input?
3059         je      .Lsqrx8x_break
3060
3061         subq    16+8(%rsp),%rbx          # reload saved carry into CF
3062 .byte   0x66
3063         movq    -64(%rsi),%rdx
3064         adcxq   0(%rdi),%r8
3065         adcxq   8(%rdi),%r9
3066         adcq    16(%rdi),%r10
3067         adcq    24(%rdi),%r11
3068         adcq    32(%rdi),%r12
3069         adcq    40(%rdi),%r13
3070         adcq    48(%rdi),%r14
3071         adcq    56(%rdi),%r15
3072         leaq    64(%rdi),%rdi
3073 .byte   0x67
3074         sbbq    %rax,%rax
3075         xorl    %ebx,%ebx                # clear CF/OF again
3076         movq    %rax,16+8(%rsp)
3077         jmp     .Lsqrx8x_loop
3078
3079 .align  32
# Flush the last partial column of this outer strip.
3080 .Lsqrx8x_break:
3081         xorq    %rbp,%rbp
3082         subq    16+8(%rsp),%rbx          # carry-in
3083         adcxq   %rbp,%r8
3084         movq    24+8(%rsp),%rcx
3085         adcxq   %rbp,%r9
3086         movq    0(%rsi),%rdx
3087         adcq    $0,%r10
3088         movq    %r8,0(%rdi)
3089         adcq    $0,%r11
3090         adcq    $0,%r12
3091         adcq    $0,%r13
3092         adcq    $0,%r14
3093         adcq    $0,%r15
3094         cmpq    %rcx,%rdi
3095         je      .Lsqrx8x_outer_loop
3096
3097         movq    %r9,8(%rdi)              # spill regs, reload next column
3098         movq    8(%rcx),%r9
3099         movq    %r10,16(%rdi)
3100         movq    16(%rcx),%r10
3101         movq    %r11,24(%rdi)
3102         movq    24(%rcx),%r11
3103         movq    %r12,32(%rdi)
3104         movq    32(%rcx),%r12
3105         movq    %r13,40(%rdi)
3106         movq    40(%rcx),%r13
3107         movq    %r14,48(%rdi)
3108         movq    48(%rcx),%r14
3109         movq    %r15,56(%rdi)
3110         movq    56(%rcx),%r15
3111         movq    %rcx,%rdi
3112         jmp     .Lsqrx8x_outer_loop
3113
3114 .align  32
# All cross products done: store the top limbs, then double the
# off-diagonal part and add the a[i]^2 diagonal squares.
3115 .Lsqrx8x_outer_break:
3116         movq    %r9,72(%rdi)
3117 .byte   102,72,15,126,217
3118         movq    %r10,80(%rdi)
3119         movq    %r11,88(%rdi)
3120         movq    %r12,96(%rdi)
3121         movq    %r13,104(%rdi)
3122         movq    %r14,112(%rdi)
3123         leaq    48+8(%rsp),%rdi
3124         movq    (%rsi,%rcx,1),%rdx
3125
3126         movq    8(%rdi),%r11
3127         xorq    %r10,%r10
3128         movq    0+8(%rsp),%r9
3129         adoxq   %r11,%r11                # start the x2 shift via adox chain
3130         movq    16(%rdi),%r12
3131         movq    24(%rdi),%r13
3132
3133
3134 .align  32
# shift-and-add: product = 2*cross + diagonal squares (a[i]^2 via mulx rdx,rdx).
3135 .Lsqrx4x_shift_n_add:
3136         mulxq   %rdx,%rax,%rbx
3137         adoxq   %r12,%r12
3138         adcxq   %r10,%rax
3139 .byte   0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3140 .byte   0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3141         adoxq   %r13,%r13
3142         adcxq   %r11,%rbx
3143         movq    40(%rdi),%r11
3144         movq    %rax,0(%rdi)
3145         movq    %rbx,8(%rdi)
3146
3147         mulxq   %rdx,%rax,%rbx
3148         adoxq   %r10,%r10
3149         adcxq   %r12,%rax
3150         movq    16(%rsi,%rcx,1),%rdx
3151         movq    48(%rdi),%r12
3152         adoxq   %r11,%r11
3153         adcxq   %r13,%rbx
3154         movq    56(%rdi),%r13
3155         movq    %rax,16(%rdi)
3156         movq    %rbx,24(%rdi)
3157
3158         mulxq   %rdx,%rax,%rbx
3159         adoxq   %r12,%r12
3160         adcxq   %r10,%rax
3161         movq    24(%rsi,%rcx,1),%rdx
3162         leaq    32(%rcx),%rcx
3163         movq    64(%rdi),%r10
3164         adoxq   %r13,%r13
3165         adcxq   %r11,%rbx
3166         movq    72(%rdi),%r11
3167         movq    %rax,32(%rdi)
3168         movq    %rbx,40(%rdi)
3169
3170         mulxq   %rdx,%rax,%rbx
3171         adoxq   %r10,%r10
3172         adcxq   %r12,%rax
3173         jrcxz   .Lsqrx4x_shift_n_add_break   # rcx==0 without touching flags
3174 .byte   0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3175         adoxq   %r11,%r11
3176         adcxq   %r13,%rbx
3177         movq    80(%rdi),%r12
3178         movq    88(%rdi),%r13
3179         movq    %rax,48(%rdi)
3180         movq    %rbx,56(%rdi)
3181         leaq    64(%rdi),%rdi
3182         nop
3183         jmp     .Lsqrx4x_shift_n_add
3184
3185 .align  32
3186 .Lsqrx4x_shift_n_add_break:
3187         adcxq   %r13,%rbx
3188         movq    %rax,48(%rdi)
3189         movq    %rbx,56(%rdi)
3190         leaq    64(%rdi),%rdi
3191 .byte   102,72,15,126,213
# ---- Montgomery reduction of the 2*num-limb square ----------------------
# rbx = n0, rdx = running limb, rbp = modulus pointer, rsi = zero source.
3192 __bn_sqrx8x_reduction:
3193         xorl    %eax,%eax
3194         movq    32+8(%rsp),%rbx          # rbx = n0
3195         movq    48+8(%rsp),%rdx
3196         leaq    -64(%rbp,%r9,1),%rcx     # rcx = end of modulus
3197
3198         movq    %rcx,0+8(%rsp)
3199         movq    %rdi,8+8(%rsp)
3200
3201         leaq    48+8(%rsp),%rdi
3202         jmp     .Lsqrx8x_reduction_loop
3203
3204 .align  32
3205 .Lsqrx8x_reduction_loop:
3206         movq    8(%rdi),%r9
3207         movq    16(%rdi),%r10
3208         movq    24(%rdi),%r11
3209         movq    32(%rdi),%r12
3210         movq    %rdx,%r8
3211         imulq   %rbx,%rdx                # m = limb * n0 mod 2^64
3212         movq    40(%rdi),%r13
3213         movq    48(%rdi),%r14
3214         movq    56(%rdi),%r15
3215         movq    %rax,24+8(%rsp)
3216
3217         leaq    64(%rdi),%rdi
3218         xorq    %rsi,%rsi                # rsi = 0, clears CF/OF
3219         movq    $-8,%rcx
3220         jmp     .Lsqrx8x_reduce
3221
3222 .align  32
# One reduction step: add m * modulus to the current window; each
# iteration also precomputes next m into rbx via n0 at 32+8(%rsp).
3223 .Lsqrx8x_reduce:
3224         movq    %r8,%rbx
3225         mulxq   0(%rbp),%rax,%r8
3226         adcxq   %rbx,%rax
3227         adoxq   %r9,%r8
3228
3229         mulxq   8(%rbp),%rbx,%r9
3230         adcxq   %rbx,%r8
3231         adoxq   %r10,%r9
3232
3233         mulxq   16(%rbp),%rbx,%r10
3234         adcxq   %rbx,%r9
3235         adoxq   %r11,%r10
3236
3237         mulxq   24(%rbp),%rbx,%r11
3238         adcxq   %rbx,%r10
3239         adoxq   %r12,%r11
3240
3241 .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3242         movq    %rdx,%rax
3243         movq    %r8,%rdx
3244         adcxq   %rbx,%r11
3245         adoxq   %r13,%r12
3246
3247         mulxq   32+8(%rsp),%rbx,%rdx     # next m = r8 * n0
3248         movq    %rax,%rdx
3249         movq    %rax,64+48+8(%rsp,%rcx,8)   # save m for the tail pass
3250
3251         mulxq   40(%rbp),%rax,%r13
3252         adcxq   %rax,%r12
3253         adoxq   %r14,%r13
3254
3255         mulxq   48(%rbp),%rax,%r14
3256         adcxq   %rax,%r13
3257         adoxq   %r15,%r14
3258
3259         mulxq   56(%rbp),%rax,%r15
3260         movq    %rbx,%rdx
3261         adcxq   %rax,%r14
3262         adoxq   %rsi,%r15
3263         adcxq   %rsi,%r15
3264
3265 .byte   0x67,0x67,0x67
3266         incq    %rcx
3267         jnz     .Lsqrx8x_reduce
3268
3269         movq    %rsi,%rax
3270         cmpq    0+8(%rsp),%rbp           # modulus fully consumed?
3271         jae     .Lsqrx8x_no_tail
3272
3273         movq    48+8(%rsp),%rdx
3274         addq    0(%rdi),%r8
3275         leaq    64(%rbp),%rbp
3276         movq    $-8,%rcx
3277         adcxq   8(%rdi),%r9
3278         adcxq   16(%rdi),%r10
3279         adcq    24(%rdi),%r11
3280         adcq    32(%rdi),%r12
3281         adcq    40(%rdi),%r13
3282         adcq    48(%rdi),%r14
3283         adcq    56(%rdi),%r15
3284         leaq    64(%rdi),%rdi
3285         sbbq    %rax,%rax
3286
3287         xorq    %rsi,%rsi
3288         movq    %rax,16+8(%rsp)
3289         jmp     .Lsqrx8x_tail
3290
3291 .align  32
# Tail pass: propagate the saved m values through the remaining
# modulus limbs.
3292 .Lsqrx8x_tail:
3293         movq    %r8,%rbx
3294         mulxq   0(%rbp),%rax,%r8
3295         adcxq   %rax,%rbx
3296         adoxq   %r9,%r8
3297
3298         mulxq   8(%rbp),%rax,%r9
3299         adcxq   %rax,%r8
3300         adoxq   %r10,%r9
3301
3302         mulxq   16(%rbp),%rax,%r10
3303         adcxq   %rax,%r9
3304         adoxq   %r11,%r10
3305
3306         mulxq   24(%rbp),%rax,%r11
3307         adcxq   %rax,%r10
3308         adoxq   %r12,%r11
3309
3310 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3311         adcxq   %rax,%r11
3312         adoxq   %r13,%r12
3313
3314         mulxq   40(%rbp),%rax,%r13
3315         adcxq   %rax,%r12
3316         adoxq   %r14,%r13
3317
3318         mulxq   48(%rbp),%rax,%r14
3319         adcxq   %rax,%r13
3320         adoxq   %r15,%r14
3321
3322         mulxq   56(%rbp),%rax,%r15
3323         movq    72+48+8(%rsp,%rcx,8),%rdx   # reload saved m
3324         adcxq   %rax,%r14
3325         adoxq   %rsi,%r15
3326         movq    %rbx,(%rdi,%rcx,8)
3327         movq    %r8,%rbx
3328         adcxq   %rsi,%r15
3329
3330         incq    %rcx
3331         jnz     .Lsqrx8x_tail
3332
3333         cmpq    0+8(%rsp),%rbp
3334         jae     .Lsqrx8x_tail_done
3335
3336         subq    16+8(%rsp),%rsi          # restore carry into CF
3337         movq    48+8(%rsp),%rdx
3338         leaq    64(%rbp),%rbp
3339         adcq    0(%rdi),%r8
3340         adcq    8(%rdi),%r9
3341         adcq    16(%rdi),%r10
3342         adcq    24(%rdi),%r11
3343         adcq    32(%rdi),%r12
3344         adcq    40(%rdi),%r13
3345         adcq    48(%rdi),%r14
3346         adcq    56(%rdi),%r15
3347         leaq    64(%rdi),%rdi
3348         sbbq    %rax,%rax
3349         subq    $8,%rcx
3350
3351         xorq    %rsi,%rsi
3352         movq    %rax,16+8(%rsp)
3353         jmp     .Lsqrx8x_tail
3354
3355 .align  32
3356 .Lsqrx8x_tail_done:
3357         xorq    %rax,%rax
3358         addq    24+8(%rsp),%r8           # top-most carry from previous round
3359         adcq    $0,%r9
3360         adcq    $0,%r10
3361         adcq    $0,%r11
3362         adcq    $0,%r12
3363         adcq    $0,%r13
3364         adcq    $0,%r14
3365         adcq    $0,%r15
3366         adcq    $0,%rax                  # rax = final carry out
3367
3368         subq    16+8(%rsp),%rsi          # restore carry into CF
3369 .Lsqrx8x_no_tail:
3370         adcq    0(%rdi),%r8
3371 .byte   102,72,15,126,217
3372         adcq    8(%rdi),%r9
3373         movq    56(%rbp),%rsi            # top modulus limb (for caller)
3374 .byte   102,72,15,126,213
3375         adcq    16(%rdi),%r10
3376         adcq    24(%rdi),%r11
3377         adcq    32(%rdi),%r12
3378         adcq    40(%rdi),%r13
3379         adcq    48(%rdi),%r14
3380         adcq    56(%rdi),%r15
3381         adcq    $0,%rax
3382
3383         movq    32+8(%rsp),%rbx
3384         movq    64(%rdi,%rcx,1),%rdx
3385
3386         movq    %r8,0(%rdi)              # commit reduced window
3387         leaq    64(%rdi),%r8
3388         movq    %r9,8(%rdi)
3389         movq    %r10,16(%rdi)
3390         movq    %r11,24(%rdi)
3391         movq    %r12,32(%rdi)
3392         movq    %r13,40(%rdi)
3393         movq    %r14,48(%rdi)
3394         movq    %r15,56(%rdi)
3395
3396         leaq    64(%rdi,%rcx,1),%rdi
3397         cmpq    8+8(%rsp),%r8
3398         jb      .Lsqrx8x_reduction_loop
3399         .byte   0xf3,0xc3                # rep ret
3400 .size   bn_sqrx8x_internal,.-bn_sqrx8x_internal
# ----------------------------------------------------------------------------
# __bn_postx4x_internal — final conditional subtraction after the MULX
# squaring/reduction: computes result - (mask & modulus), 4 limbs per
# iteration, writing to the destination in rdx.  rax on entry is the
# 0 / -1 borrow mask (negq turns it into the andn mask); the masked
# subtraction keeps the memory access pattern independent of the data.
# rbp = modulus, rdi = reduced value, rcx = -num (restored via negq %r9).
# The .byte 102,72,15,126,xx pairs are hand-encoded movq xmm->r64 that
# recover the destination/source pointers stashed by the caller.
3401 .align  32
3402 __bn_postx4x_internal:
3403         movq    0(%rbp),%r12
3404         movq    %rcx,%r10
3405         movq    %rcx,%r9                 # keep length for final restore
3406         negq    %rax                     # rax = subtraction mask (0 or -1)
3407         sarq    $3+2,%rcx                # bytes -> groups of 4 limbs (negative)
3408
3409 .byte   102,72,15,126,202
3410 .byte   102,72,15,126,206
3411         decq    %r12                     # prime CF for the sbb/adc chain
3412         movq    8(%rbp),%r13
3413         xorq    %r8,%r8
3414         movq    16(%rbp),%r14
3415         movq    24(%rbp),%r15
3416         jmp     .Lsqrx4x_sub_entry
3417
3418 .align  16
3419 .Lsqrx4x_sub:
3420         movq    0(%rbp),%r12
3421         movq    8(%rbp),%r13
3422         movq    16(%rbp),%r14
3423         movq    24(%rbp),%r15
3424 .Lsqrx4x_sub_entry:
3425         andnq   %rax,%r12,%r12           # r12 = ~n[i] & mask
3426         leaq    32(%rbp),%rbp
3427         andnq   %rax,%r13,%r13
3428         andnq   %rax,%r14,%r14
3429         andnq   %rax,%r15,%r15
3430
3431         negq    %r8                      # reload borrow into CF
3432         adcq    0(%rdi),%r12             # res + (~n & mask) + CF == res - (n & mask)
3433         adcq    8(%rdi),%r13
3434         adcq    16(%rdi),%r14
3435         adcq    24(%rdi),%r15
3436         movq    %r12,0(%rdx)
3437         leaq    32(%rdi),%rdi
3438         movq    %r13,8(%rdx)
3439         sbbq    %r8,%r8                  # save borrow for next group
3440         movq    %r14,16(%rdx)
3441         movq    %r15,24(%rdx)
3442         leaq    32(%rdx),%rdx
3443
3444         incq    %rcx
3445         jnz     .Lsqrx4x_sub
3446
3447         negq    %r9                      # restore positive byte length
3448
3449         .byte   0xf3,0xc3                # rep ret
3450 .size   __bn_postx4x_internal,.-__bn_postx4x_internal
.globl  bn_get_bits5
.type   bn_get_bits5,@function
.align  16
# int bn_get_bits5(const BN_ULONG *ap, int off)
# Returns the 5-bit window occupying bits off..off+4 of the little-endian
# bignum ap[] (used for fixed-window modular exponentiation).
# Branch-free: the base/shift adjustment for windows that straddle a
# 16-bit word is done with CMOVs, so the execution trace does not depend
# on the (secret-derived) bit offset.
# In:    %rdi = ap, %esi = bit offset
# Out:   %eax = window value (0..31)
# Clobbers: %rcx,%r10,%r11, flags
bn_get_bits5:
        leaq    0(%rdi),%r10            # word-aligned base
        leaq    1(%rdi),%r11            # byte-shifted base (straddle case)
        movl    %esi,%ecx
        shrl    $4,%esi                 # esi = 16-bit word index
        andl    $15,%ecx                # ecx = bit position within word
        leal    -8(%rcx),%eax           # shift to use with the +1 base
        cmpl    $11,%ecx
        cmovaq  %r11,%r10               # bit > 11: window crosses the word
        cmoval  %eax,%ecx               # boundary -> read 1 byte later,
                                        # shift 8 bits less
        movzwl  (%r10,%rsi,2),%eax      # load 16 bits containing the window
        shrl    %cl,%eax
        andl    $31,%eax                # keep the low 5 bits
        .byte   0xf3,0xc3              # rep ret
.size   bn_get_bits5,.-bn_get_bits5
3469
.globl  bn_scatter5
.type   bn_scatter5,@function
.align  16
# void bn_scatter5(const BN_ULONG *inp, size_t num, void *tbl, size_t idx)
# Stores num words of inp into column idx of a 32-column interleaved
# table: word i is written at tbl + idx*8 + i*256 (a row holds one word
# of each of the 32 pre-computed powers).  This layout lets bn_gather5
# read an entire row per output word, which is what makes the later
# gather cache-timing resistant.
# In:    %rdi = inp, %esi = num, %rdx = tbl, %rcx = idx (0..31)
# Clobbers: %rax, flags (advances %rdi/%rdx/%esi)
bn_scatter5:
        cmpl    $0,%esi
        jz      .Lscatter_epilogue      # num == 0: nothing to store
        leaq    (%rdx,%rcx,8),%rdx      # column base: &tbl[idx]
.Lscatter:
        movq    (%rdi),%rax
        leaq    8(%rdi),%rdi
        movq    %rax,(%rdx)
        leaq    256(%rdx),%rdx          # next row: 32 columns * 8 bytes
        subl    $1,%esi
        jnz     .Lscatter
.Lscatter_epilogue:
        .byte   0xf3,0xc3              # rep ret
.size   bn_scatter5,.-bn_scatter5
3487
.globl  bn_gather5
.type   bn_gather5,@function
.align  32
# void bn_gather5(BN_ULONG *out, size_t num, void *tbl, size_t idx)
# Cache-timing-resistant gather: copies column idx (0..31) out of the
# interleaved table written by bn_scatter5.  Instead of indexing the
# table with the secret idx, it builds 16 PCMPEQD-generated masks (two
# 64-bit lanes each, covering all 32 columns) in a 256-byte stack area,
# then for every output word PAND/PORs across the ENTIRE 256-byte row --
# every cache line of the table is touched regardless of idx, so the
# access pattern leaks nothing.
# In:    %rdi = out, %esi = num, %rdx = tbl, %ecx = idx
# Clobbers: %rax,%r10,%r11,%xmm0-%xmm5, flags
bn_gather5:
.LSEH_begin_bn_gather5:

# Hand-encoded prologue (kept as raw bytes so the Win64 SEH prologue
# length stays fixed across assemblers):
.byte   0x4c,0x8d,0x14,0x24            # lea (%rsp),%r10  -- save old rsp
.byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp -- mask area
        leaq    .Linc(%rip),%rax
        andq    $-16,%rsp               # 16-byte align for movdqa stores

        movd    %ecx,%xmm5
        movdqa  0(%rax),%xmm0           # {0,0,1,1}: first two column ids
        movdqa  16(%rax),%xmm1          # {2,2,2,2}: per-step increment
        leaq    128(%rdx),%r11          # biased table pointer
        leaq    128(%rsp),%rax          # biased mask-area pointer

        # Build 16 masks: xmm5 = broadcast(idx); each stored xmm slot is
        # PCMPEQD(idx, {2k,2k,2k+1,2k+1}) -- all-ones in the 64-bit lane
        # whose column number equals idx, zero everywhere else.
        pshufd  $0,%xmm5,%xmm5
        movdqa  %xmm1,%xmm4
        movdqa  %xmm1,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0             # mask for columns 0/1
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1             # mask for columns 2/3
        movdqa  %xmm0,-128(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-112(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-96(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-80(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,-64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-48(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-16(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,0(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,16(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,48(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,80(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,96(%rax)
        movdqa  %xmm4,%xmm2
        movdqa  %xmm3,112(%rax)         # mask for columns 30/31
        jmp     .Lgather

.align  32
.Lgather:
        # One output word per iteration: AND each of the 16 row chunks
        # with its mask and OR everything together -- only the selected
        # column survives, but all 256 bytes of the row are read.
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        movdqa  -128(%r11),%xmm0
        movdqa  -112(%r11),%xmm1
        movdqa  -96(%r11),%xmm2
        pand    -128(%rax),%xmm0
        movdqa  -80(%r11),%xmm3
        pand    -112(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -80(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  -64(%r11),%xmm0
        movdqa  -48(%r11),%xmm1
        movdqa  -32(%r11),%xmm2
        pand    -64(%rax),%xmm0
        movdqa  -16(%r11),%xmm3
        pand    -48(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -16(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  0(%r11),%xmm0
        movdqa  16(%r11),%xmm1
        movdqa  32(%r11),%xmm2
        pand    0(%rax),%xmm0
        movdqa  48(%r11),%xmm3
        pand    16(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    48(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  64(%r11),%xmm0
        movdqa  80(%r11),%xmm1
        movdqa  96(%r11),%xmm2
        pand    64(%rax),%xmm0
        movdqa  112(%r11),%xmm3
        pand    80(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    112(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        por     %xmm5,%xmm4
        leaq    256(%r11),%r11          # advance to the next table row
        pshufd  $0x4e,%xmm4,%xmm0       # swap 64-bit halves...
        por     %xmm4,%xmm0             # ...and merge: low qword = result
        movq    %xmm0,(%rdi)
        leaq    8(%rdi),%rdi
        subl    $1,%esi
        jnz     .Lgather

        leaq    (%r10),%rsp             # restore caller's rsp
        .byte   0xf3,0xc3              # rep ret
.LSEH_end_bn_gather5:
.size   bn_gather5,.-bn_gather5
.align  64
# .Linc: dword increment vectors used when enumerating the 32 column
# indices for the PCMPEQD mask build in bn_mul_mont_gather5/bn_gather5:
# {0,0,1,1} seeds the first paired-dword index vector, {2,2,2,2} is the
# per-step increment applied to reach every subsequent index pair.
.Linc:
.long   0,0, 1,1
.long   2,2, 2,2
# NUL-terminated identification string:
# "Montgomery Multiplication with scatter/gather for x86_64,
#  CRYPTOGAMS by <appro@openssl.org>"
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0