FreeBSD repository file: secure/lib/libcrypto/amd64/x86_64-mont5.S
MFC: r306193
(secure / lib / libcrypto / amd64 / x86_64-mont5.S)
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3 .text   
4
5
6
7 .globl  bn_mul_mont_gather5
8 .type   bn_mul_mont_gather5,@function
9 .align  64
10 bn_mul_mont_gather5:
11         movl    %r9d,%r9d
12         movq    %rsp,%rax
13         testl   $7,%r9d
14         jnz     .Lmul_enter
15         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
16         jmp     .Lmul4x_enter
17
18 .align  16
19 .Lmul_enter:
20         movd    8(%rsp),%xmm5
21         pushq   %rbx
22         pushq   %rbp
23         pushq   %r12
24         pushq   %r13
25         pushq   %r14
26         pushq   %r15
27
28         negq    %r9
29         movq    %rsp,%r11
30         leaq    -280(%rsp,%r9,8),%r10
31         negq    %r9
32         andq    $-1024,%r10
33
34
35
36
37
38
39
40         subq    %r10,%r11
41         andq    $-4096,%r11
42         leaq    (%r10,%r11,1),%rsp
43         movq    (%rsp),%r11
44         cmpq    %r10,%rsp
45         ja      .Lmul_page_walk
46         jmp     .Lmul_page_walk_done
47
48 .Lmul_page_walk:
49         leaq    -4096(%rsp),%rsp
50         movq    (%rsp),%r11
51         cmpq    %r10,%rsp
52         ja      .Lmul_page_walk
53 .Lmul_page_walk_done:
54
55         leaq    .Linc(%rip),%r10
56         movq    %rax,8(%rsp,%r9,8)
57 .Lmul_body:
58
59         leaq    128(%rdx),%r12
60         movdqa  0(%r10),%xmm0
61         movdqa  16(%r10),%xmm1
62         leaq    24-112(%rsp,%r9,8),%r10
63         andq    $-16,%r10
64
65         pshufd  $0,%xmm5,%xmm5
66         movdqa  %xmm1,%xmm4
67         movdqa  %xmm1,%xmm2
68         paddd   %xmm0,%xmm1
69         pcmpeqd %xmm5,%xmm0
70 .byte   0x67
71         movdqa  %xmm4,%xmm3
72         paddd   %xmm1,%xmm2
73         pcmpeqd %xmm5,%xmm1
74         movdqa  %xmm0,112(%r10)
75         movdqa  %xmm4,%xmm0
76
77         paddd   %xmm2,%xmm3
78         pcmpeqd %xmm5,%xmm2
79         movdqa  %xmm1,128(%r10)
80         movdqa  %xmm4,%xmm1
81
82         paddd   %xmm3,%xmm0
83         pcmpeqd %xmm5,%xmm3
84         movdqa  %xmm2,144(%r10)
85         movdqa  %xmm4,%xmm2
86
87         paddd   %xmm0,%xmm1
88         pcmpeqd %xmm5,%xmm0
89         movdqa  %xmm3,160(%r10)
90         movdqa  %xmm4,%xmm3
91         paddd   %xmm1,%xmm2
92         pcmpeqd %xmm5,%xmm1
93         movdqa  %xmm0,176(%r10)
94         movdqa  %xmm4,%xmm0
95
96         paddd   %xmm2,%xmm3
97         pcmpeqd %xmm5,%xmm2
98         movdqa  %xmm1,192(%r10)
99         movdqa  %xmm4,%xmm1
100
101         paddd   %xmm3,%xmm0
102         pcmpeqd %xmm5,%xmm3
103         movdqa  %xmm2,208(%r10)
104         movdqa  %xmm4,%xmm2
105
106         paddd   %xmm0,%xmm1
107         pcmpeqd %xmm5,%xmm0
108         movdqa  %xmm3,224(%r10)
109         movdqa  %xmm4,%xmm3
110         paddd   %xmm1,%xmm2
111         pcmpeqd %xmm5,%xmm1
112         movdqa  %xmm0,240(%r10)
113         movdqa  %xmm4,%xmm0
114
115         paddd   %xmm2,%xmm3
116         pcmpeqd %xmm5,%xmm2
117         movdqa  %xmm1,256(%r10)
118         movdqa  %xmm4,%xmm1
119
120         paddd   %xmm3,%xmm0
121         pcmpeqd %xmm5,%xmm3
122         movdqa  %xmm2,272(%r10)
123         movdqa  %xmm4,%xmm2
124
125         paddd   %xmm0,%xmm1
126         pcmpeqd %xmm5,%xmm0
127         movdqa  %xmm3,288(%r10)
128         movdqa  %xmm4,%xmm3
129         paddd   %xmm1,%xmm2
130         pcmpeqd %xmm5,%xmm1
131         movdqa  %xmm0,304(%r10)
132
133         paddd   %xmm2,%xmm3
134 .byte   0x67
135         pcmpeqd %xmm5,%xmm2
136         movdqa  %xmm1,320(%r10)
137
138         pcmpeqd %xmm5,%xmm3
139         movdqa  %xmm2,336(%r10)
140         pand    64(%r12),%xmm0
141
142         pand    80(%r12),%xmm1
143         pand    96(%r12),%xmm2
144         movdqa  %xmm3,352(%r10)
145         pand    112(%r12),%xmm3
146         por     %xmm2,%xmm0
147         por     %xmm3,%xmm1
148         movdqa  -128(%r12),%xmm4
149         movdqa  -112(%r12),%xmm5
150         movdqa  -96(%r12),%xmm2
151         pand    112(%r10),%xmm4
152         movdqa  -80(%r12),%xmm3
153         pand    128(%r10),%xmm5
154         por     %xmm4,%xmm0
155         pand    144(%r10),%xmm2
156         por     %xmm5,%xmm1
157         pand    160(%r10),%xmm3
158         por     %xmm2,%xmm0
159         por     %xmm3,%xmm1
160         movdqa  -64(%r12),%xmm4
161         movdqa  -48(%r12),%xmm5
162         movdqa  -32(%r12),%xmm2
163         pand    176(%r10),%xmm4
164         movdqa  -16(%r12),%xmm3
165         pand    192(%r10),%xmm5
166         por     %xmm4,%xmm0
167         pand    208(%r10),%xmm2
168         por     %xmm5,%xmm1
169         pand    224(%r10),%xmm3
170         por     %xmm2,%xmm0
171         por     %xmm3,%xmm1
172         movdqa  0(%r12),%xmm4
173         movdqa  16(%r12),%xmm5
174         movdqa  32(%r12),%xmm2
175         pand    240(%r10),%xmm4
176         movdqa  48(%r12),%xmm3
177         pand    256(%r10),%xmm5
178         por     %xmm4,%xmm0
179         pand    272(%r10),%xmm2
180         por     %xmm5,%xmm1
181         pand    288(%r10),%xmm3
182         por     %xmm2,%xmm0
183         por     %xmm3,%xmm1
184         por     %xmm1,%xmm0
185         pshufd  $0x4e,%xmm0,%xmm1
186         por     %xmm1,%xmm0
187         leaq    256(%r12),%r12
188 .byte   102,72,15,126,195
189
190         movq    (%r8),%r8
191         movq    (%rsi),%rax
192
193         xorq    %r14,%r14
194         xorq    %r15,%r15
195
196         movq    %r8,%rbp
197         mulq    %rbx
198         movq    %rax,%r10
199         movq    (%rcx),%rax
200
201         imulq   %r10,%rbp
202         movq    %rdx,%r11
203
204         mulq    %rbp
205         addq    %rax,%r10
206         movq    8(%rsi),%rax
207         adcq    $0,%rdx
208         movq    %rdx,%r13
209
210         leaq    1(%r15),%r15
211         jmp     .L1st_enter
212
213 .align  16
214 .L1st:
215         addq    %rax,%r13
216         movq    (%rsi,%r15,8),%rax
217         adcq    $0,%rdx
218         addq    %r11,%r13
219         movq    %r10,%r11
220         adcq    $0,%rdx
221         movq    %r13,-16(%rsp,%r15,8)
222         movq    %rdx,%r13
223
224 .L1st_enter:
225         mulq    %rbx
226         addq    %rax,%r11
227         movq    (%rcx,%r15,8),%rax
228         adcq    $0,%rdx
229         leaq    1(%r15),%r15
230         movq    %rdx,%r10
231
232         mulq    %rbp
233         cmpq    %r9,%r15
234         jne     .L1st
235
236
237         addq    %rax,%r13
238         adcq    $0,%rdx
239         addq    %r11,%r13
240         adcq    $0,%rdx
241         movq    %r13,-16(%rsp,%r9,8)
242         movq    %rdx,%r13
243         movq    %r10,%r11
244
245         xorq    %rdx,%rdx
246         addq    %r11,%r13
247         adcq    $0,%rdx
248         movq    %r13,-8(%rsp,%r9,8)
249         movq    %rdx,(%rsp,%r9,8)
250
251         leaq    1(%r14),%r14
252         jmp     .Louter
253 .align  16
254 .Louter:
255         leaq    24+128(%rsp,%r9,8),%rdx
256         andq    $-16,%rdx
257         pxor    %xmm4,%xmm4
258         pxor    %xmm5,%xmm5
259         movdqa  -128(%r12),%xmm0
260         movdqa  -112(%r12),%xmm1
261         movdqa  -96(%r12),%xmm2
262         movdqa  -80(%r12),%xmm3
263         pand    -128(%rdx),%xmm0
264         pand    -112(%rdx),%xmm1
265         por     %xmm0,%xmm4
266         pand    -96(%rdx),%xmm2
267         por     %xmm1,%xmm5
268         pand    -80(%rdx),%xmm3
269         por     %xmm2,%xmm4
270         por     %xmm3,%xmm5
271         movdqa  -64(%r12),%xmm0
272         movdqa  -48(%r12),%xmm1
273         movdqa  -32(%r12),%xmm2
274         movdqa  -16(%r12),%xmm3
275         pand    -64(%rdx),%xmm0
276         pand    -48(%rdx),%xmm1
277         por     %xmm0,%xmm4
278         pand    -32(%rdx),%xmm2
279         por     %xmm1,%xmm5
280         pand    -16(%rdx),%xmm3
281         por     %xmm2,%xmm4
282         por     %xmm3,%xmm5
283         movdqa  0(%r12),%xmm0
284         movdqa  16(%r12),%xmm1
285         movdqa  32(%r12),%xmm2
286         movdqa  48(%r12),%xmm3
287         pand    0(%rdx),%xmm0
288         pand    16(%rdx),%xmm1
289         por     %xmm0,%xmm4
290         pand    32(%rdx),%xmm2
291         por     %xmm1,%xmm5
292         pand    48(%rdx),%xmm3
293         por     %xmm2,%xmm4
294         por     %xmm3,%xmm5
295         movdqa  64(%r12),%xmm0
296         movdqa  80(%r12),%xmm1
297         movdqa  96(%r12),%xmm2
298         movdqa  112(%r12),%xmm3
299         pand    64(%rdx),%xmm0
300         pand    80(%rdx),%xmm1
301         por     %xmm0,%xmm4
302         pand    96(%rdx),%xmm2
303         por     %xmm1,%xmm5
304         pand    112(%rdx),%xmm3
305         por     %xmm2,%xmm4
306         por     %xmm3,%xmm5
307         por     %xmm5,%xmm4
308         pshufd  $0x4e,%xmm4,%xmm0
309         por     %xmm4,%xmm0
310         leaq    256(%r12),%r12
311
312         movq    (%rsi),%rax
313 .byte   102,72,15,126,195
314
315         xorq    %r15,%r15
316         movq    %r8,%rbp
317         movq    (%rsp),%r10
318
319         mulq    %rbx
320         addq    %rax,%r10
321         movq    (%rcx),%rax
322         adcq    $0,%rdx
323
324         imulq   %r10,%rbp
325         movq    %rdx,%r11
326
327         mulq    %rbp
328         addq    %rax,%r10
329         movq    8(%rsi),%rax
330         adcq    $0,%rdx
331         movq    8(%rsp),%r10
332         movq    %rdx,%r13
333
334         leaq    1(%r15),%r15
335         jmp     .Linner_enter
336
337 .align  16
338 .Linner:
339         addq    %rax,%r13
340         movq    (%rsi,%r15,8),%rax
341         adcq    $0,%rdx
342         addq    %r10,%r13
343         movq    (%rsp,%r15,8),%r10
344         adcq    $0,%rdx
345         movq    %r13,-16(%rsp,%r15,8)
346         movq    %rdx,%r13
347
348 .Linner_enter:
349         mulq    %rbx
350         addq    %rax,%r11
351         movq    (%rcx,%r15,8),%rax
352         adcq    $0,%rdx
353         addq    %r11,%r10
354         movq    %rdx,%r11
355         adcq    $0,%r11
356         leaq    1(%r15),%r15
357
358         mulq    %rbp
359         cmpq    %r9,%r15
360         jne     .Linner
361
362         addq    %rax,%r13
363         adcq    $0,%rdx
364         addq    %r10,%r13
365         movq    (%rsp,%r9,8),%r10
366         adcq    $0,%rdx
367         movq    %r13,-16(%rsp,%r9,8)
368         movq    %rdx,%r13
369
370         xorq    %rdx,%rdx
371         addq    %r11,%r13
372         adcq    $0,%rdx
373         addq    %r10,%r13
374         adcq    $0,%rdx
375         movq    %r13,-8(%rsp,%r9,8)
376         movq    %rdx,(%rsp,%r9,8)
377
378         leaq    1(%r14),%r14
379         cmpq    %r9,%r14
380         jb      .Louter
381
382         xorq    %r14,%r14
383         movq    (%rsp),%rax
384         leaq    (%rsp),%rsi
385         movq    %r9,%r15
386         jmp     .Lsub
387 .align  16
388 .Lsub:  sbbq    (%rcx,%r14,8),%rax
389         movq    %rax,(%rdi,%r14,8)
390         movq    8(%rsi,%r14,8),%rax
391         leaq    1(%r14),%r14
392         decq    %r15
393         jnz     .Lsub
394
395         sbbq    $0,%rax
396         xorq    %r14,%r14
397         andq    %rax,%rsi
398         notq    %rax
399         movq    %rdi,%rcx
400         andq    %rax,%rcx
401         movq    %r9,%r15
402         orq     %rcx,%rsi
403 .align  16
404 .Lcopy:
405         movq    (%rsi,%r14,8),%rax
406         movq    %r14,(%rsp,%r14,8)
407         movq    %rax,(%rdi,%r14,8)
408         leaq    1(%r14),%r14
409         subq    $1,%r15
410         jnz     .Lcopy
411
412         movq    8(%rsp,%r9,8),%rsi
413         movq    $1,%rax
414
415         movq    -48(%rsi),%r15
416         movq    -40(%rsi),%r14
417         movq    -32(%rsi),%r13
418         movq    -24(%rsi),%r12
419         movq    -16(%rsi),%rbp
420         movq    -8(%rsi),%rbx
421         leaq    (%rsi),%rsp
422 .Lmul_epilogue:
423         .byte   0xf3,0xc3
424 .size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
425 .type   bn_mul4x_mont_gather5,@function
426 .align  32
427 bn_mul4x_mont_gather5:
428 .byte   0x67
429         movq    %rsp,%rax
430 .Lmul4x_enter:
431         andl    $0x80108,%r11d
432         cmpl    $0x80108,%r11d
433         je      .Lmulx4x_enter
434         pushq   %rbx
435         pushq   %rbp
436         pushq   %r12
437         pushq   %r13
438         pushq   %r14
439         pushq   %r15
440 .Lmul4x_prologue:
441
442 .byte   0x67
443         shll    $3,%r9d
444         leaq    (%r9,%r9,2),%r10
445         negq    %r9
446
447
448
449
450
451
452
453
454
455
456         leaq    -320(%rsp,%r9,2),%r11
457         movq    %rsp,%rbp
458         subq    %rdi,%r11
459         andq    $4095,%r11
460         cmpq    %r11,%r10
461         jb      .Lmul4xsp_alt
462         subq    %r11,%rbp
463         leaq    -320(%rbp,%r9,2),%rbp
464         jmp     .Lmul4xsp_done
465
466 .align  32
467 .Lmul4xsp_alt:
468         leaq    4096-320(,%r9,2),%r10
469         leaq    -320(%rbp,%r9,2),%rbp
470         subq    %r10,%r11
471         movq    $0,%r10
472         cmovcq  %r10,%r11
473         subq    %r11,%rbp
474 .Lmul4xsp_done:
475         andq    $-64,%rbp
476         movq    %rsp,%r11
477         subq    %rbp,%r11
478         andq    $-4096,%r11
479         leaq    (%r11,%rbp,1),%rsp
480         movq    (%rsp),%r10
481         cmpq    %rbp,%rsp
482         ja      .Lmul4x_page_walk
483         jmp     .Lmul4x_page_walk_done
484
485 .Lmul4x_page_walk:
486         leaq    -4096(%rsp),%rsp
487         movq    (%rsp),%r10
488         cmpq    %rbp,%rsp
489         ja      .Lmul4x_page_walk
490 .Lmul4x_page_walk_done:
491
492         negq    %r9
493
494         movq    %rax,40(%rsp)
495 .Lmul4x_body:
496
497         call    mul4x_internal
498
499         movq    40(%rsp),%rsi
500         movq    $1,%rax
501
502         movq    -48(%rsi),%r15
503         movq    -40(%rsi),%r14
504         movq    -32(%rsi),%r13
505         movq    -24(%rsi),%r12
506         movq    -16(%rsi),%rbp
507         movq    -8(%rsi),%rbx
508         leaq    (%rsi),%rsp
509 .Lmul4x_epilogue:
510         .byte   0xf3,0xc3
511 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
512
513 .type   mul4x_internal,@function
514 .align  32
515 mul4x_internal:
516         shlq    $5,%r9
517         movd    8(%rax),%xmm5
518         leaq    .Linc(%rip),%rax
519         leaq    128(%rdx,%r9,1),%r13
520         shrq    $5,%r9
521         movdqa  0(%rax),%xmm0
522         movdqa  16(%rax),%xmm1
523         leaq    88-112(%rsp,%r9,1),%r10
524         leaq    128(%rdx),%r12
525
526         pshufd  $0,%xmm5,%xmm5
527         movdqa  %xmm1,%xmm4
528 .byte   0x67,0x67
529         movdqa  %xmm1,%xmm2
530         paddd   %xmm0,%xmm1
531         pcmpeqd %xmm5,%xmm0
532 .byte   0x67
533         movdqa  %xmm4,%xmm3
534         paddd   %xmm1,%xmm2
535         pcmpeqd %xmm5,%xmm1
536         movdqa  %xmm0,112(%r10)
537         movdqa  %xmm4,%xmm0
538
539         paddd   %xmm2,%xmm3
540         pcmpeqd %xmm5,%xmm2
541         movdqa  %xmm1,128(%r10)
542         movdqa  %xmm4,%xmm1
543
544         paddd   %xmm3,%xmm0
545         pcmpeqd %xmm5,%xmm3
546         movdqa  %xmm2,144(%r10)
547         movdqa  %xmm4,%xmm2
548
549         paddd   %xmm0,%xmm1
550         pcmpeqd %xmm5,%xmm0
551         movdqa  %xmm3,160(%r10)
552         movdqa  %xmm4,%xmm3
553         paddd   %xmm1,%xmm2
554         pcmpeqd %xmm5,%xmm1
555         movdqa  %xmm0,176(%r10)
556         movdqa  %xmm4,%xmm0
557
558         paddd   %xmm2,%xmm3
559         pcmpeqd %xmm5,%xmm2
560         movdqa  %xmm1,192(%r10)
561         movdqa  %xmm4,%xmm1
562
563         paddd   %xmm3,%xmm0
564         pcmpeqd %xmm5,%xmm3
565         movdqa  %xmm2,208(%r10)
566         movdqa  %xmm4,%xmm2
567
568         paddd   %xmm0,%xmm1
569         pcmpeqd %xmm5,%xmm0
570         movdqa  %xmm3,224(%r10)
571         movdqa  %xmm4,%xmm3
572         paddd   %xmm1,%xmm2
573         pcmpeqd %xmm5,%xmm1
574         movdqa  %xmm0,240(%r10)
575         movdqa  %xmm4,%xmm0
576
577         paddd   %xmm2,%xmm3
578         pcmpeqd %xmm5,%xmm2
579         movdqa  %xmm1,256(%r10)
580         movdqa  %xmm4,%xmm1
581
582         paddd   %xmm3,%xmm0
583         pcmpeqd %xmm5,%xmm3
584         movdqa  %xmm2,272(%r10)
585         movdqa  %xmm4,%xmm2
586
587         paddd   %xmm0,%xmm1
588         pcmpeqd %xmm5,%xmm0
589         movdqa  %xmm3,288(%r10)
590         movdqa  %xmm4,%xmm3
591         paddd   %xmm1,%xmm2
592         pcmpeqd %xmm5,%xmm1
593         movdqa  %xmm0,304(%r10)
594
595         paddd   %xmm2,%xmm3
596 .byte   0x67
597         pcmpeqd %xmm5,%xmm2
598         movdqa  %xmm1,320(%r10)
599
600         pcmpeqd %xmm5,%xmm3
601         movdqa  %xmm2,336(%r10)
602         pand    64(%r12),%xmm0
603
604         pand    80(%r12),%xmm1
605         pand    96(%r12),%xmm2
606         movdqa  %xmm3,352(%r10)
607         pand    112(%r12),%xmm3
608         por     %xmm2,%xmm0
609         por     %xmm3,%xmm1
610         movdqa  -128(%r12),%xmm4
611         movdqa  -112(%r12),%xmm5
612         movdqa  -96(%r12),%xmm2
613         pand    112(%r10),%xmm4
614         movdqa  -80(%r12),%xmm3
615         pand    128(%r10),%xmm5
616         por     %xmm4,%xmm0
617         pand    144(%r10),%xmm2
618         por     %xmm5,%xmm1
619         pand    160(%r10),%xmm3
620         por     %xmm2,%xmm0
621         por     %xmm3,%xmm1
622         movdqa  -64(%r12),%xmm4
623         movdqa  -48(%r12),%xmm5
624         movdqa  -32(%r12),%xmm2
625         pand    176(%r10),%xmm4
626         movdqa  -16(%r12),%xmm3
627         pand    192(%r10),%xmm5
628         por     %xmm4,%xmm0
629         pand    208(%r10),%xmm2
630         por     %xmm5,%xmm1
631         pand    224(%r10),%xmm3
632         por     %xmm2,%xmm0
633         por     %xmm3,%xmm1
634         movdqa  0(%r12),%xmm4
635         movdqa  16(%r12),%xmm5
636         movdqa  32(%r12),%xmm2
637         pand    240(%r10),%xmm4
638         movdqa  48(%r12),%xmm3
639         pand    256(%r10),%xmm5
640         por     %xmm4,%xmm0
641         pand    272(%r10),%xmm2
642         por     %xmm5,%xmm1
643         pand    288(%r10),%xmm3
644         por     %xmm2,%xmm0
645         por     %xmm3,%xmm1
646         por     %xmm1,%xmm0
647         pshufd  $0x4e,%xmm0,%xmm1
648         por     %xmm1,%xmm0
649         leaq    256(%r12),%r12
650 .byte   102,72,15,126,195
651
652         movq    %r13,16+8(%rsp)
653         movq    %rdi,56+8(%rsp)
654
655         movq    (%r8),%r8
656         movq    (%rsi),%rax
657         leaq    (%rsi,%r9,1),%rsi
658         negq    %r9
659
660         movq    %r8,%rbp
661         mulq    %rbx
662         movq    %rax,%r10
663         movq    (%rcx),%rax
664
665         imulq   %r10,%rbp
666         leaq    64+8(%rsp),%r14
667         movq    %rdx,%r11
668
669         mulq    %rbp
670         addq    %rax,%r10
671         movq    8(%rsi,%r9,1),%rax
672         adcq    $0,%rdx
673         movq    %rdx,%rdi
674
675         mulq    %rbx
676         addq    %rax,%r11
677         movq    8(%rcx),%rax
678         adcq    $0,%rdx
679         movq    %rdx,%r10
680
681         mulq    %rbp
682         addq    %rax,%rdi
683         movq    16(%rsi,%r9,1),%rax
684         adcq    $0,%rdx
685         addq    %r11,%rdi
686         leaq    32(%r9),%r15
687         leaq    32(%rcx),%rcx
688         adcq    $0,%rdx
689         movq    %rdi,(%r14)
690         movq    %rdx,%r13
691         jmp     .L1st4x
692
693 .align  32
694 .L1st4x:
695         mulq    %rbx
696         addq    %rax,%r10
697         movq    -16(%rcx),%rax
698         leaq    32(%r14),%r14
699         adcq    $0,%rdx
700         movq    %rdx,%r11
701
702         mulq    %rbp
703         addq    %rax,%r13
704         movq    -8(%rsi,%r15,1),%rax
705         adcq    $0,%rdx
706         addq    %r10,%r13
707         adcq    $0,%rdx
708         movq    %r13,-24(%r14)
709         movq    %rdx,%rdi
710
711         mulq    %rbx
712         addq    %rax,%r11
713         movq    -8(%rcx),%rax
714         adcq    $0,%rdx
715         movq    %rdx,%r10
716
717         mulq    %rbp
718         addq    %rax,%rdi
719         movq    (%rsi,%r15,1),%rax
720         adcq    $0,%rdx
721         addq    %r11,%rdi
722         adcq    $0,%rdx
723         movq    %rdi,-16(%r14)
724         movq    %rdx,%r13
725
726         mulq    %rbx
727         addq    %rax,%r10
728         movq    0(%rcx),%rax
729         adcq    $0,%rdx
730         movq    %rdx,%r11
731
732         mulq    %rbp
733         addq    %rax,%r13
734         movq    8(%rsi,%r15,1),%rax
735         adcq    $0,%rdx
736         addq    %r10,%r13
737         adcq    $0,%rdx
738         movq    %r13,-8(%r14)
739         movq    %rdx,%rdi
740
741         mulq    %rbx
742         addq    %rax,%r11
743         movq    8(%rcx),%rax
744         adcq    $0,%rdx
745         movq    %rdx,%r10
746
747         mulq    %rbp
748         addq    %rax,%rdi
749         movq    16(%rsi,%r15,1),%rax
750         adcq    $0,%rdx
751         addq    %r11,%rdi
752         leaq    32(%rcx),%rcx
753         adcq    $0,%rdx
754         movq    %rdi,(%r14)
755         movq    %rdx,%r13
756
757         addq    $32,%r15
758         jnz     .L1st4x
759
760         mulq    %rbx
761         addq    %rax,%r10
762         movq    -16(%rcx),%rax
763         leaq    32(%r14),%r14
764         adcq    $0,%rdx
765         movq    %rdx,%r11
766
767         mulq    %rbp
768         addq    %rax,%r13
769         movq    -8(%rsi),%rax
770         adcq    $0,%rdx
771         addq    %r10,%r13
772         adcq    $0,%rdx
773         movq    %r13,-24(%r14)
774         movq    %rdx,%rdi
775
776         mulq    %rbx
777         addq    %rax,%r11
778         movq    -8(%rcx),%rax
779         adcq    $0,%rdx
780         movq    %rdx,%r10
781
782         mulq    %rbp
783         addq    %rax,%rdi
784         movq    (%rsi,%r9,1),%rax
785         adcq    $0,%rdx
786         addq    %r11,%rdi
787         adcq    $0,%rdx
788         movq    %rdi,-16(%r14)
789         movq    %rdx,%r13
790
791         leaq    (%rcx,%r9,1),%rcx
792
793         xorq    %rdi,%rdi
794         addq    %r10,%r13
795         adcq    $0,%rdi
796         movq    %r13,-8(%r14)
797
798         jmp     .Louter4x
799
800 .align  32
801 .Louter4x:
802         leaq    16+128(%r14),%rdx
803         pxor    %xmm4,%xmm4
804         pxor    %xmm5,%xmm5
805         movdqa  -128(%r12),%xmm0
806         movdqa  -112(%r12),%xmm1
807         movdqa  -96(%r12),%xmm2
808         movdqa  -80(%r12),%xmm3
809         pand    -128(%rdx),%xmm0
810         pand    -112(%rdx),%xmm1
811         por     %xmm0,%xmm4
812         pand    -96(%rdx),%xmm2
813         por     %xmm1,%xmm5
814         pand    -80(%rdx),%xmm3
815         por     %xmm2,%xmm4
816         por     %xmm3,%xmm5
817         movdqa  -64(%r12),%xmm0
818         movdqa  -48(%r12),%xmm1
819         movdqa  -32(%r12),%xmm2
820         movdqa  -16(%r12),%xmm3
821         pand    -64(%rdx),%xmm0
822         pand    -48(%rdx),%xmm1
823         por     %xmm0,%xmm4
824         pand    -32(%rdx),%xmm2
825         por     %xmm1,%xmm5
826         pand    -16(%rdx),%xmm3
827         por     %xmm2,%xmm4
828         por     %xmm3,%xmm5
829         movdqa  0(%r12),%xmm0
830         movdqa  16(%r12),%xmm1
831         movdqa  32(%r12),%xmm2
832         movdqa  48(%r12),%xmm3
833         pand    0(%rdx),%xmm0
834         pand    16(%rdx),%xmm1
835         por     %xmm0,%xmm4
836         pand    32(%rdx),%xmm2
837         por     %xmm1,%xmm5
838         pand    48(%rdx),%xmm3
839         por     %xmm2,%xmm4
840         por     %xmm3,%xmm5
841         movdqa  64(%r12),%xmm0
842         movdqa  80(%r12),%xmm1
843         movdqa  96(%r12),%xmm2
844         movdqa  112(%r12),%xmm3
845         pand    64(%rdx),%xmm0
846         pand    80(%rdx),%xmm1
847         por     %xmm0,%xmm4
848         pand    96(%rdx),%xmm2
849         por     %xmm1,%xmm5
850         pand    112(%rdx),%xmm3
851         por     %xmm2,%xmm4
852         por     %xmm3,%xmm5
853         por     %xmm5,%xmm4
854         pshufd  $0x4e,%xmm4,%xmm0
855         por     %xmm4,%xmm0
856         leaq    256(%r12),%r12
857 .byte   102,72,15,126,195
858
859         movq    (%r14,%r9,1),%r10
860         movq    %r8,%rbp
861         mulq    %rbx
862         addq    %rax,%r10
863         movq    (%rcx),%rax
864         adcq    $0,%rdx
865
866         imulq   %r10,%rbp
867         movq    %rdx,%r11
868         movq    %rdi,(%r14)
869
870         leaq    (%r14,%r9,1),%r14
871
872         mulq    %rbp
873         addq    %rax,%r10
874         movq    8(%rsi,%r9,1),%rax
875         adcq    $0,%rdx
876         movq    %rdx,%rdi
877
878         mulq    %rbx
879         addq    %rax,%r11
880         movq    8(%rcx),%rax
881         adcq    $0,%rdx
882         addq    8(%r14),%r11
883         adcq    $0,%rdx
884         movq    %rdx,%r10
885
886         mulq    %rbp
887         addq    %rax,%rdi
888         movq    16(%rsi,%r9,1),%rax
889         adcq    $0,%rdx
890         addq    %r11,%rdi
891         leaq    32(%r9),%r15
892         leaq    32(%rcx),%rcx
893         adcq    $0,%rdx
894         movq    %rdx,%r13
895         jmp     .Linner4x
896
897 .align  32
898 .Linner4x:
899         mulq    %rbx
900         addq    %rax,%r10
901         movq    -16(%rcx),%rax
902         adcq    $0,%rdx
903         addq    16(%r14),%r10
904         leaq    32(%r14),%r14
905         adcq    $0,%rdx
906         movq    %rdx,%r11
907
908         mulq    %rbp
909         addq    %rax,%r13
910         movq    -8(%rsi,%r15,1),%rax
911         adcq    $0,%rdx
912         addq    %r10,%r13
913         adcq    $0,%rdx
914         movq    %rdi,-32(%r14)
915         movq    %rdx,%rdi
916
917         mulq    %rbx
918         addq    %rax,%r11
919         movq    -8(%rcx),%rax
920         adcq    $0,%rdx
921         addq    -8(%r14),%r11
922         adcq    $0,%rdx
923         movq    %rdx,%r10
924
925         mulq    %rbp
926         addq    %rax,%rdi
927         movq    (%rsi,%r15,1),%rax
928         adcq    $0,%rdx
929         addq    %r11,%rdi
930         adcq    $0,%rdx
931         movq    %r13,-24(%r14)
932         movq    %rdx,%r13
933
934         mulq    %rbx
935         addq    %rax,%r10
936         movq    0(%rcx),%rax
937         adcq    $0,%rdx
938         addq    (%r14),%r10
939         adcq    $0,%rdx
940         movq    %rdx,%r11
941
942         mulq    %rbp
943         addq    %rax,%r13
944         movq    8(%rsi,%r15,1),%rax
945         adcq    $0,%rdx
946         addq    %r10,%r13
947         adcq    $0,%rdx
948         movq    %rdi,-16(%r14)
949         movq    %rdx,%rdi
950
951         mulq    %rbx
952         addq    %rax,%r11
953         movq    8(%rcx),%rax
954         adcq    $0,%rdx
955         addq    8(%r14),%r11
956         adcq    $0,%rdx
957         movq    %rdx,%r10
958
959         mulq    %rbp
960         addq    %rax,%rdi
961         movq    16(%rsi,%r15,1),%rax
962         adcq    $0,%rdx
963         addq    %r11,%rdi
964         leaq    32(%rcx),%rcx
965         adcq    $0,%rdx
966         movq    %r13,-8(%r14)
967         movq    %rdx,%r13
968
969         addq    $32,%r15
970         jnz     .Linner4x
971
972         mulq    %rbx
973         addq    %rax,%r10
974         movq    -16(%rcx),%rax
975         adcq    $0,%rdx
976         addq    16(%r14),%r10
977         leaq    32(%r14),%r14
978         adcq    $0,%rdx
979         movq    %rdx,%r11
980
981         mulq    %rbp
982         addq    %rax,%r13
983         movq    -8(%rsi),%rax
984         adcq    $0,%rdx
985         addq    %r10,%r13
986         adcq    $0,%rdx
987         movq    %rdi,-32(%r14)
988         movq    %rdx,%rdi
989
990         mulq    %rbx
991         addq    %rax,%r11
992         movq    %rbp,%rax
993         movq    -8(%rcx),%rbp
994         adcq    $0,%rdx
995         addq    -8(%r14),%r11
996         adcq    $0,%rdx
997         movq    %rdx,%r10
998
999         mulq    %rbp
1000         addq    %rax,%rdi
1001         movq    (%rsi,%r9,1),%rax
1002         adcq    $0,%rdx
1003         addq    %r11,%rdi
1004         adcq    $0,%rdx
1005         movq    %r13,-24(%r14)
1006         movq    %rdx,%r13
1007
1008         movq    %rdi,-16(%r14)
1009         leaq    (%rcx,%r9,1),%rcx
1010
1011         xorq    %rdi,%rdi
1012         addq    %r10,%r13
1013         adcq    $0,%rdi
1014         addq    (%r14),%r13
1015         adcq    $0,%rdi
1016         movq    %r13,-8(%r14)
1017
1018         cmpq    16+8(%rsp),%r12
1019         jb      .Louter4x
1020         xorq    %rax,%rax
1021         subq    %r13,%rbp
1022         adcq    %r15,%r15
1023         orq     %r15,%rdi
1024         subq    %rdi,%rax
1025         leaq    (%r14,%r9,1),%rbx
1026         movq    (%rcx),%r12
1027         leaq    (%rcx),%rbp
1028         movq    %r9,%rcx
1029         sarq    $3+2,%rcx
1030         movq    56+8(%rsp),%rdi
1031         decq    %r12
1032         xorq    %r10,%r10
1033         movq    8(%rbp),%r13
1034         movq    16(%rbp),%r14
1035         movq    24(%rbp),%r15
1036         jmp     .Lsqr4x_sub_entry
1037 .size   mul4x_internal,.-mul4x_internal
1038 .globl  bn_power5
1039 .type   bn_power5,@function
1040 .align  32
1041 bn_power5:
1042         movq    %rsp,%rax
1043         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
1044         andl    $0x80108,%r11d
1045         cmpl    $0x80108,%r11d
1046         je      .Lpowerx5_enter
1047         pushq   %rbx
1048         pushq   %rbp
1049         pushq   %r12
1050         pushq   %r13
1051         pushq   %r14
1052         pushq   %r15
1053 .Lpower5_prologue:
1054
1055         shll    $3,%r9d
1056         leal    (%r9,%r9,2),%r10d
1057         negq    %r9
1058         movq    (%r8),%r8
1059
1060
1061
1062
1063
1064
1065
1066
1067         leaq    -320(%rsp,%r9,2),%r11
1068         movq    %rsp,%rbp
1069         subq    %rdi,%r11
1070         andq    $4095,%r11
1071         cmpq    %r11,%r10
1072         jb      .Lpwr_sp_alt
1073         subq    %r11,%rbp
1074         leaq    -320(%rbp,%r9,2),%rbp
1075         jmp     .Lpwr_sp_done
1076
1077 .align  32
1078 .Lpwr_sp_alt:
1079         leaq    4096-320(,%r9,2),%r10
1080         leaq    -320(%rbp,%r9,2),%rbp
1081         subq    %r10,%r11
1082         movq    $0,%r10
1083         cmovcq  %r10,%r11
1084         subq    %r11,%rbp
1085 .Lpwr_sp_done:
1086         andq    $-64,%rbp
1087         movq    %rsp,%r11
1088         subq    %rbp,%r11
1089         andq    $-4096,%r11
1090         leaq    (%r11,%rbp,1),%rsp
1091         movq    (%rsp),%r10
1092         cmpq    %rbp,%rsp
1093         ja      .Lpwr_page_walk
1094         jmp     .Lpwr_page_walk_done
1095
1096 .Lpwr_page_walk:
1097         leaq    -4096(%rsp),%rsp
1098         movq    (%rsp),%r10
1099         cmpq    %rbp,%rsp
1100         ja      .Lpwr_page_walk
1101 .Lpwr_page_walk_done:
1102
1103         movq    %r9,%r10
1104         negq    %r9
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115         movq    %r8,32(%rsp)
1116         movq    %rax,40(%rsp)
1117 .Lpower5_body:
1118 .byte   102,72,15,110,207
1119 .byte   102,72,15,110,209
1120 .byte   102,73,15,110,218
1121 .byte   102,72,15,110,226
1122
1123         call    __bn_sqr8x_internal
1124         call    __bn_post4x_internal
1125         call    __bn_sqr8x_internal
1126         call    __bn_post4x_internal
1127         call    __bn_sqr8x_internal
1128         call    __bn_post4x_internal
1129         call    __bn_sqr8x_internal
1130         call    __bn_post4x_internal
1131         call    __bn_sqr8x_internal
1132         call    __bn_post4x_internal
1133
1134 .byte   102,72,15,126,209
1135 .byte   102,72,15,126,226
1136         movq    %rsi,%rdi
1137         movq    40(%rsp),%rax
1138         leaq    32(%rsp),%r8
1139
1140         call    mul4x_internal
1141
1142         movq    40(%rsp),%rsi
1143         movq    $1,%rax
1144         movq    -48(%rsi),%r15
1145         movq    -40(%rsi),%r14
1146         movq    -32(%rsi),%r13
1147         movq    -24(%rsi),%r12
1148         movq    -16(%rsi),%rbp
1149         movq    -8(%rsi),%rbx
1150         leaq    (%rsi),%rsp
1151 .Lpower5_epilogue:
1152         .byte   0xf3,0xc3
1153 .size   bn_power5,.-bn_power5
1154
#-----------------------------------------------------------------------
# bn_sqr8x_internal / __bn_sqr8x_internal
# Schoolbook squaring of a multi-limb big number, followed by an
# eight-limbs-at-a-time Montgomery reduction (__bn_sqr8x_reduction is a
# second entry point, called directly by bn_from_mont8x below, that
# performs only the reduction step).
# NOTE(review): the register contract is inherited from the callers in
# this file (bn_power5 / bn_from_mont8x): %rsi = input limbs, %rbp =
# modulus, %r9 = byte length of the operand, 32+8(%rsp) = n0 word,
# 48+8(%rsp) = temporary double-width result area — confirm against
# x86_64-mont5.pl.  Clobbers all general-purpose registers and flags.
#-----------------------------------------------------------------------
1155 .globl  bn_sqr8x_internal
1156 .hidden bn_sqr8x_internal
1157 .type   bn_sqr8x_internal,@function
1158 .align  32
1159 bn_sqr8x_internal:
1160 __bn_sqr8x_internal:
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
# Set up source/destination cursors and load the two lowest source
# limbs into %r14/%r15; products are written into the scratch area at
# 48+8(%rsp).
1234         leaq    32(%r10),%rbp
1235         leaq    (%rsi,%r9,1),%rsi
1236
1237         movq    %r9,%rcx
1238
1239
1240         movq    -32(%rsi,%rbp,1),%r14
1241         leaq    48+8(%rsp,%r9,2),%rdi
1242         movq    -24(%rsi,%rbp,1),%rax
1243         leaq    -32(%rdi,%rbp,1),%rdi
1244         movq    -16(%rsi,%rbp,1),%rbx
1245         movq    %rax,%r15
1246
1247         mulq    %r14
1248         movq    %rax,%r10
1249         movq    %rbx,%rax
1250         movq    %rdx,%r11
1251         movq    %r10,-24(%rdi,%rbp,1)
1252
1253         mulq    %r14
1254         addq    %rax,%r11
1255         movq    %rbx,%rax
1256         adcq    $0,%rdx
1257         movq    %r11,-16(%rdi,%rbp,1)
1258         movq    %rdx,%r10
1259
1260
1261         movq    -8(%rsi,%rbp,1),%rbx
1262         mulq    %r15
1263         movq    %rax,%r12
1264         movq    %rbx,%rax
1265         movq    %rdx,%r13
1266
1267         leaq    (%rbp),%rcx
1268         mulq    %r14
1269         addq    %rax,%r10
1270         movq    %rbx,%rax
1271         movq    %rdx,%r11
1272         adcq    $0,%r11
1273         addq    %r12,%r10
1274         adcq    $0,%r11
1275         movq    %r10,-8(%rdi,%rcx,1)
1276         jmp     .Lsqr4x_1st
1277
# First pass: multiply the whole number by the two lowest limbs
# (%r14/%r15), interleaving the two mulq carry chains and writing four
# result limbs per iteration.
1278 .align  32
1279 .Lsqr4x_1st:
1280         movq    (%rsi,%rcx,1),%rbx
1281         mulq    %r15
1282         addq    %rax,%r13
1283         movq    %rbx,%rax
1284         movq    %rdx,%r12
1285         adcq    $0,%r12
1286
1287         mulq    %r14
1288         addq    %rax,%r11
1289         movq    %rbx,%rax
1290         movq    8(%rsi,%rcx,1),%rbx
1291         movq    %rdx,%r10
1292         adcq    $0,%r10
1293         addq    %r13,%r11
1294         adcq    $0,%r10
1295
1296
1297         mulq    %r15
1298         addq    %rax,%r12
1299         movq    %rbx,%rax
1300         movq    %r11,(%rdi,%rcx,1)
1301         movq    %rdx,%r13
1302         adcq    $0,%r13
1303
1304         mulq    %r14
1305         addq    %rax,%r10
1306         movq    %rbx,%rax
1307         movq    16(%rsi,%rcx,1),%rbx
1308         movq    %rdx,%r11
1309         adcq    $0,%r11
1310         addq    %r12,%r10
1311         adcq    $0,%r11
1312
1313         mulq    %r15
1314         addq    %rax,%r13
1315         movq    %rbx,%rax
1316         movq    %r10,8(%rdi,%rcx,1)
1317         movq    %rdx,%r12
1318         adcq    $0,%r12
1319
1320         mulq    %r14
1321         addq    %rax,%r11
1322         movq    %rbx,%rax
1323         movq    24(%rsi,%rcx,1),%rbx
1324         movq    %rdx,%r10
1325         adcq    $0,%r10
1326         addq    %r13,%r11
1327         adcq    $0,%r10
1328
1329
1330         mulq    %r15
1331         addq    %rax,%r12
1332         movq    %rbx,%rax
1333         movq    %r11,16(%rdi,%rcx,1)
1334         movq    %rdx,%r13
1335         adcq    $0,%r13
1336         leaq    32(%rcx),%rcx
1337
1338         mulq    %r14
1339         addq    %rax,%r10
1340         movq    %rbx,%rax
1341         movq    %rdx,%r11
1342         adcq    $0,%r11
1343         addq    %r12,%r10
1344         adcq    $0,%r11
1345         movq    %r10,-8(%rdi,%rcx,1)
1346
1347         cmpq    $0,%rcx
1348         jne     .Lsqr4x_1st
1349
1350         mulq    %r15
1351         addq    %rax,%r13
1352         leaq    16(%rbp),%rbp
1353         adcq    $0,%rdx
1354         addq    %r11,%r13
1355         adcq    $0,%rdx
1356
1357         movq    %r13,(%rdi)
1358         movq    %rdx,%r12
1359         movq    %rdx,8(%rdi)
1360         jmp     .Lsqr4x_outer
1361
# Subsequent passes: accumulate further a[i]*a[j] cross products into
# the partial sums already stored in the scratch area.
1362 .align  32
1363 .Lsqr4x_outer:
1364         movq    -32(%rsi,%rbp,1),%r14
1365         leaq    48+8(%rsp,%r9,2),%rdi
1366         movq    -24(%rsi,%rbp,1),%rax
1367         leaq    -32(%rdi,%rbp,1),%rdi
1368         movq    -16(%rsi,%rbp,1),%rbx
1369         movq    %rax,%r15
1370
1371         mulq    %r14
1372         movq    -24(%rdi,%rbp,1),%r10
1373         addq    %rax,%r10
1374         movq    %rbx,%rax
1375         adcq    $0,%rdx
1376         movq    %r10,-24(%rdi,%rbp,1)
1377         movq    %rdx,%r11
1378
1379         mulq    %r14
1380         addq    %rax,%r11
1381         movq    %rbx,%rax
1382         adcq    $0,%rdx
1383         addq    -16(%rdi,%rbp,1),%r11
1384         movq    %rdx,%r10
1385         adcq    $0,%r10
1386         movq    %r11,-16(%rdi,%rbp,1)
1387
1388         xorq    %r12,%r12
1389
1390         movq    -8(%rsi,%rbp,1),%rbx
1391         mulq    %r15
1392         addq    %rax,%r12
1393         movq    %rbx,%rax
1394         adcq    $0,%rdx
1395         addq    -8(%rdi,%rbp,1),%r12
1396         movq    %rdx,%r13
1397         adcq    $0,%r13
1398
1399         mulq    %r14
1400         addq    %rax,%r10
1401         movq    %rbx,%rax
1402         adcq    $0,%rdx
1403         addq    %r12,%r10
1404         movq    %rdx,%r11
1405         adcq    $0,%r11
1406         movq    %r10,-8(%rdi,%rbp,1)
1407
1408         leaq    (%rbp),%rcx
1409         jmp     .Lsqr4x_inner
1410
# Inner loop: two interleaved multiply-accumulate chains (by %r14 and
# %r15) per 16 bytes of input, adding into previously stored limbs.
1411 .align  32
1412 .Lsqr4x_inner:
1413         movq    (%rsi,%rcx,1),%rbx
1414         mulq    %r15
1415         addq    %rax,%r13
1416         movq    %rbx,%rax
1417         movq    %rdx,%r12
1418         adcq    $0,%r12
1419         addq    (%rdi,%rcx,1),%r13
1420         adcq    $0,%r12
1421
1422 .byte   0x67
1423         mulq    %r14
1424         addq    %rax,%r11
1425         movq    %rbx,%rax
1426         movq    8(%rsi,%rcx,1),%rbx
1427         movq    %rdx,%r10
1428         adcq    $0,%r10
1429         addq    %r13,%r11
1430         adcq    $0,%r10
1431
1432         mulq    %r15
1433         addq    %rax,%r12
1434         movq    %r11,(%rdi,%rcx,1)
1435         movq    %rbx,%rax
1436         movq    %rdx,%r13
1437         adcq    $0,%r13
1438         addq    8(%rdi,%rcx,1),%r12
1439         leaq    16(%rcx),%rcx
1440         adcq    $0,%r13
1441
1442         mulq    %r14
1443         addq    %rax,%r10
1444         movq    %rbx,%rax
1445         adcq    $0,%rdx
1446         addq    %r12,%r10
1447         movq    %rdx,%r11
1448         adcq    $0,%r11
1449         movq    %r10,-8(%rdi,%rcx,1)
1450
1451         cmpq    $0,%rcx
1452         jne     .Lsqr4x_inner
1453
1454 .byte   0x67
1455         mulq    %r15
1456         addq    %rax,%r13
1457         adcq    $0,%rdx
1458         addq    %r11,%r13
1459         adcq    $0,%rdx
1460
1461         movq    %r13,(%rdi)
1462         movq    %rdx,%r12
1463         movq    %rdx,8(%rdi)
1464
1465         addq    $16,%rbp
1466         jnz     .Lsqr4x_outer
1467
1468
# Last 4x4 corner of the triangle, done straight-line.
1469         movq    -32(%rsi),%r14
1470         leaq    48+8(%rsp,%r9,2),%rdi
1471         movq    -24(%rsi),%rax
1472         leaq    -32(%rdi,%rbp,1),%rdi
1473         movq    -16(%rsi),%rbx
1474         movq    %rax,%r15
1475
1476         mulq    %r14
1477         addq    %rax,%r10
1478         movq    %rbx,%rax
1479         movq    %rdx,%r11
1480         adcq    $0,%r11
1481
1482         mulq    %r14
1483         addq    %rax,%r11
1484         movq    %rbx,%rax
1485         movq    %r10,-24(%rdi)
1486         movq    %rdx,%r10
1487         adcq    $0,%r10
1488         addq    %r13,%r11
1489         movq    -8(%rsi),%rbx
1490         adcq    $0,%r10
1491
1492         mulq    %r15
1493         addq    %rax,%r12
1494         movq    %rbx,%rax
1495         movq    %r11,-16(%rdi)
1496         movq    %rdx,%r13
1497         adcq    $0,%r13
1498
1499         mulq    %r14
1500         addq    %rax,%r10
1501         movq    %rbx,%rax
1502         movq    %rdx,%r11
1503         adcq    $0,%r11
1504         addq    %r12,%r10
1505         adcq    $0,%r11
1506         movq    %r10,-8(%rdi)
1507
1508         mulq    %r15
1509         addq    %rax,%r13
1510         movq    -16(%rsi),%rax
1511         adcq    $0,%rdx
1512         addq    %r11,%r13
1513         adcq    $0,%rdx
1514
1515         movq    %r13,(%rdi)
1516         movq    %rdx,%r12
1517         movq    %rdx,8(%rdi)
1518
1519         mulq    %rbx
1520         addq    $16,%rbp
1521         xorq    %r14,%r14
1522         subq    %r9,%rbp
1523         xorq    %r15,%r15
1524
1525         addq    %r12,%rax
1526         adcq    $0,%rdx
1527         movq    %rax,8(%rdi)
1528         movq    %rdx,16(%rdi)
1529         movq    %r15,24(%rdi)
1530
# Doubling phase: each cross product is doubled via lea x,(reg,reg2,2)
# with the shifted-out top bit carried through %r10/%r11, and the
# square of each input limb (mulq %rax) is added in; %r15 carries the
# add chain between pairs (sbb/neg trick).
1531         movq    -16(%rsi,%rbp,1),%rax
1532         leaq    48+8(%rsp),%rdi
1533         xorq    %r10,%r10
1534         movq    8(%rdi),%r11
1535
1536         leaq    (%r14,%r10,2),%r12
1537         shrq    $63,%r10
1538         leaq    (%rcx,%r11,2),%r13
1539         shrq    $63,%r11
1540         orq     %r10,%r13
1541         movq    16(%rdi),%r10
1542         movq    %r11,%r14
1543         mulq    %rax
1544         negq    %r15
1545         movq    24(%rdi),%r11
1546         adcq    %rax,%r12
1547         movq    -8(%rsi,%rbp,1),%rax
1548         movq    %r12,(%rdi)
1549         adcq    %rdx,%r13
1550
1551         leaq    (%r14,%r10,2),%rbx
1552         movq    %r13,8(%rdi)
1553         sbbq    %r15,%r15
1554         shrq    $63,%r10
1555         leaq    (%rcx,%r11,2),%r8
1556         shrq    $63,%r11
1557         orq     %r10,%r8
1558         movq    32(%rdi),%r10
1559         movq    %r11,%r14
1560         mulq    %rax
1561         negq    %r15
1562         movq    40(%rdi),%r11
1563         adcq    %rax,%rbx
1564         movq    0(%rsi,%rbp,1),%rax
1565         movq    %rbx,16(%rdi)
1566         adcq    %rdx,%r8
1567         leaq    16(%rbp),%rbp
1568         movq    %r8,24(%rdi)
1569         sbbq    %r15,%r15
1570         leaq    64(%rdi),%rdi
1571         jmp     .Lsqr4x_shift_n_add
1572
1573 .align  32
1574 .Lsqr4x_shift_n_add:
1575         leaq    (%r14,%r10,2),%r12
1576         shrq    $63,%r10
1577         leaq    (%rcx,%r11,2),%r13
1578         shrq    $63,%r11
1579         orq     %r10,%r13
1580         movq    -16(%rdi),%r10
1581         movq    %r11,%r14
1582         mulq    %rax
1583         negq    %r15
1584         movq    -8(%rdi),%r11
1585         adcq    %rax,%r12
1586         movq    -8(%rsi,%rbp,1),%rax
1587         movq    %r12,-32(%rdi)
1588         adcq    %rdx,%r13
1589
1590         leaq    (%r14,%r10,2),%rbx
1591         movq    %r13,-24(%rdi)
1592         sbbq    %r15,%r15
1593         shrq    $63,%r10
1594         leaq    (%rcx,%r11,2),%r8
1595         shrq    $63,%r11
1596         orq     %r10,%r8
1597         movq    0(%rdi),%r10
1598         movq    %r11,%r14
1599         mulq    %rax
1600         negq    %r15
1601         movq    8(%rdi),%r11
1602         adcq    %rax,%rbx
1603         movq    0(%rsi,%rbp,1),%rax
1604         movq    %rbx,-16(%rdi)
1605         adcq    %rdx,%r8
1606
1607         leaq    (%r14,%r10,2),%r12
1608         movq    %r8,-8(%rdi)
1609         sbbq    %r15,%r15
1610         shrq    $63,%r10
1611         leaq    (%rcx,%r11,2),%r13
1612         shrq    $63,%r11
1613         orq     %r10,%r13
1614         movq    16(%rdi),%r10
1615         movq    %r11,%r14
1616         mulq    %rax
1617         negq    %r15
1618         movq    24(%rdi),%r11
1619         adcq    %rax,%r12
1620         movq    8(%rsi,%rbp,1),%rax
1621         movq    %r12,0(%rdi)
1622         adcq    %rdx,%r13
1623
1624         leaq    (%r14,%r10,2),%rbx
1625         movq    %r13,8(%rdi)
1626         sbbq    %r15,%r15
1627         shrq    $63,%r10
1628         leaq    (%rcx,%r11,2),%r8
1629         shrq    $63,%r11
1630         orq     %r10,%r8
1631         movq    32(%rdi),%r10
1632         movq    %r11,%r14
1633         mulq    %rax
1634         negq    %r15
1635         movq    40(%rdi),%r11
1636         adcq    %rax,%rbx
1637         movq    16(%rsi,%rbp,1),%rax
1638         movq    %rbx,16(%rdi)
1639         adcq    %rdx,%r8
1640         movq    %r8,24(%rdi)
1641         sbbq    %r15,%r15
1642         leaq    64(%rdi),%rdi
1643         addq    $32,%rbp
1644         jnz     .Lsqr4x_shift_n_add
1645
# Final doubled pair plus the top limb's square.
1646         leaq    (%r14,%r10,2),%r12
1647 .byte   0x67
1648         shrq    $63,%r10
1649         leaq    (%rcx,%r11,2),%r13
1650         shrq    $63,%r11
1651         orq     %r10,%r13
1652         movq    -16(%rdi),%r10
1653         movq    %r11,%r14
1654         mulq    %rax
1655         negq    %r15
1656         movq    -8(%rdi),%r11
1657         adcq    %rax,%r12
1658         movq    -8(%rsi),%rax
1659         movq    %r12,-32(%rdi)
1660         adcq    %rdx,%r13
1661
1662         leaq    (%r14,%r10,2),%rbx
1663         movq    %r13,-24(%rdi)
1664         sbbq    %r15,%r15
1665         shrq    $63,%r10
1666         leaq    (%rcx,%r11,2),%r8
1667         shrq    $63,%r11
1668         orq     %r10,%r8
1669         mulq    %rax
1670         negq    %r15
1671         adcq    %rax,%rbx
1672         adcq    %rdx,%r8
1673         movq    %rbx,-16(%rdi)
1674         movq    %r8,-8(%rdi)
# movq %xmm2,%rbp — restore the modulus pointer stashed by the caller.
1675 .byte   102,72,15,126,213
# Montgomery reduction of the double-width result, eight limbs per
# outer iteration; 32+8(%rsp) holds n0 and %rbp walks the modulus.
1676 __bn_sqr8x_reduction:
1677         xorq    %rax,%rax
1678         leaq    (%r9,%rbp,1),%rcx
1679         leaq    48+8(%rsp,%r9,2),%rdx
1680         movq    %rcx,0+8(%rsp)
1681         leaq    48+8(%rsp,%r9,1),%rdi
1682         movq    %rdx,8+8(%rsp)
1683         negq    %r9
1684         jmp     .L8x_reduction_loop
1685
1686 .align  32
1687 .L8x_reduction_loop:
1688         leaq    (%rdi,%r9,1),%rdi
1689 .byte   0x66
1690         movq    0(%rdi),%rbx
1691         movq    8(%rdi),%r9
1692         movq    16(%rdi),%r10
1693         movq    24(%rdi),%r11
1694         movq    32(%rdi),%r12
1695         movq    40(%rdi),%r13
1696         movq    48(%rdi),%r14
1697         movq    56(%rdi),%r15
1698         movq    %rax,(%rdx)
1699         leaq    64(%rdi),%rdi
1700
1701 .byte   0x67
1702         movq    %rbx,%r8
1703         imulq   32+8(%rsp),%rbx
1704         movq    0(%rbp),%rax
1705         movl    $8,%ecx
1706         jmp     .L8x_reduce
1707
# m = tp[0]*n0 (in %rbx); accumulate m*n[0..7] and save each m on the
# stack for the tail pass; the next m is prepared via imulq %r8,%rsi.
1708 .align  32
1709 .L8x_reduce:
1710         mulq    %rbx
1711         movq    8(%rbp),%rax
1712         negq    %r8
1713         movq    %rdx,%r8
1714         adcq    $0,%r8
1715
1716         mulq    %rbx
1717         addq    %rax,%r9
1718         movq    16(%rbp),%rax
1719         adcq    $0,%rdx
1720         addq    %r9,%r8
1721         movq    %rbx,48-8+8(%rsp,%rcx,8)
1722         movq    %rdx,%r9
1723         adcq    $0,%r9
1724
1725         mulq    %rbx
1726         addq    %rax,%r10
1727         movq    24(%rbp),%rax
1728         adcq    $0,%rdx
1729         addq    %r10,%r9
1730         movq    32+8(%rsp),%rsi
1731         movq    %rdx,%r10
1732         adcq    $0,%r10
1733
1734         mulq    %rbx
1735         addq    %rax,%r11
1736         movq    32(%rbp),%rax
1737         adcq    $0,%rdx
1738         imulq   %r8,%rsi
1739         addq    %r11,%r10
1740         movq    %rdx,%r11
1741         adcq    $0,%r11
1742
1743         mulq    %rbx
1744         addq    %rax,%r12
1745         movq    40(%rbp),%rax
1746         adcq    $0,%rdx
1747         addq    %r12,%r11
1748         movq    %rdx,%r12
1749         adcq    $0,%r12
1750
1751         mulq    %rbx
1752         addq    %rax,%r13
1753         movq    48(%rbp),%rax
1754         adcq    $0,%rdx
1755         addq    %r13,%r12
1756         movq    %rdx,%r13
1757         adcq    $0,%r13
1758
1759         mulq    %rbx
1760         addq    %rax,%r14
1761         movq    56(%rbp),%rax
1762         adcq    $0,%rdx
1763         addq    %r14,%r13
1764         movq    %rdx,%r14
1765         adcq    $0,%r14
1766
1767         mulq    %rbx
1768         movq    %rsi,%rbx
1769         addq    %rax,%r15
1770         movq    0(%rbp),%rax
1771         adcq    $0,%rdx
1772         addq    %r15,%r14
1773         movq    %rdx,%r15
1774         adcq    $0,%r15
1775
1776         decl    %ecx
1777         jnz     .L8x_reduce
1778
1779         leaq    64(%rbp),%rbp
1780         xorq    %rax,%rax
1781         movq    8+8(%rsp),%rdx
1782         cmpq    0+8(%rsp),%rbp
1783         jae     .L8x_no_tail
1784
1785 .byte   0x66
1786         addq    0(%rdi),%r8
1787         adcq    8(%rdi),%r9
1788         adcq    16(%rdi),%r10
1789         adcq    24(%rdi),%r11
1790         adcq    32(%rdi),%r12
1791         adcq    40(%rdi),%r13
1792         adcq    48(%rdi),%r14
1793         adcq    56(%rdi),%r15
1794         sbbq    %rsi,%rsi
1795
1796         movq    48+56+8(%rsp),%rbx
1797         movl    $8,%ecx
1798         movq    0(%rbp),%rax
1799         jmp     .L8x_tail
1800
# Tail pass: propagate the eight saved m values (reloaded from the
# stack) across the remaining modulus limbs.
1801 .align  32
1802 .L8x_tail:
1803         mulq    %rbx
1804         addq    %rax,%r8
1805         movq    8(%rbp),%rax
1806         movq    %r8,(%rdi)
1807         movq    %rdx,%r8
1808         adcq    $0,%r8
1809
1810         mulq    %rbx
1811         addq    %rax,%r9
1812         movq    16(%rbp),%rax
1813         adcq    $0,%rdx
1814         addq    %r9,%r8
1815         leaq    8(%rdi),%rdi
1816         movq    %rdx,%r9
1817         adcq    $0,%r9
1818
1819         mulq    %rbx
1820         addq    %rax,%r10
1821         movq    24(%rbp),%rax
1822         adcq    $0,%rdx
1823         addq    %r10,%r9
1824         movq    %rdx,%r10
1825         adcq    $0,%r10
1826
1827         mulq    %rbx
1828         addq    %rax,%r11
1829         movq    32(%rbp),%rax
1830         adcq    $0,%rdx
1831         addq    %r11,%r10
1832         movq    %rdx,%r11
1833         adcq    $0,%r11
1834
1835         mulq    %rbx
1836         addq    %rax,%r12
1837         movq    40(%rbp),%rax
1838         adcq    $0,%rdx
1839         addq    %r12,%r11
1840         movq    %rdx,%r12
1841         adcq    $0,%r12
1842
1843         mulq    %rbx
1844         addq    %rax,%r13
1845         movq    48(%rbp),%rax
1846         adcq    $0,%rdx
1847         addq    %r13,%r12
1848         movq    %rdx,%r13
1849         adcq    $0,%r13
1850
1851         mulq    %rbx
1852         addq    %rax,%r14
1853         movq    56(%rbp),%rax
1854         adcq    $0,%rdx
1855         addq    %r14,%r13
1856         movq    %rdx,%r14
1857         adcq    $0,%r14
1858
1859         mulq    %rbx
1860         movq    48-16+8(%rsp,%rcx,8),%rbx
1861         addq    %rax,%r15
1862         adcq    $0,%rdx
1863         addq    %r15,%r14
1864         movq    0(%rbp),%rax
1865         movq    %rdx,%r15
1866         adcq    $0,%r15
1867
1868         decl    %ecx
1869         jnz     .L8x_tail
1870
1871         leaq    64(%rbp),%rbp
1872         movq    8+8(%rsp),%rdx
1873         cmpq    0+8(%rsp),%rbp
1874         jae     .L8x_tail_done
1875
1876         movq    48+56+8(%rsp),%rbx
1877         negq    %rsi
1878         movq    0(%rbp),%rax
1879         adcq    0(%rdi),%r8
1880         adcq    8(%rdi),%r9
1881         adcq    16(%rdi),%r10
1882         adcq    24(%rdi),%r11
1883         adcq    32(%rdi),%r12
1884         adcq    40(%rdi),%r13
1885         adcq    48(%rdi),%r14
1886         adcq    56(%rdi),%r15
1887         sbbq    %rsi,%rsi
1888
1889         movl    $8,%ecx
1890         jmp     .L8x_tail
1891
# Fold in the final carries; %rax accumulates the top carry that the
# conditional subtraction (__bn_post4x_internal) consumes.
1892 .align  32
1893 .L8x_tail_done:
1894         addq    (%rdx),%r8
1895         adcq    $0,%r9
1896         adcq    $0,%r10
1897         adcq    $0,%r11
1898         adcq    $0,%r12
1899         adcq    $0,%r13
1900         adcq    $0,%r14
1901         adcq    $0,%r15
1902
1903
1904         xorq    %rax,%rax
1905
1906         negq    %rsi
1907 .L8x_no_tail:
1908         adcq    0(%rdi),%r8
1909         adcq    8(%rdi),%r9
1910         adcq    16(%rdi),%r10
1911         adcq    24(%rdi),%r11
1912         adcq    32(%rdi),%r12
1913         adcq    40(%rdi),%r13
1914         adcq    48(%rdi),%r14
1915         adcq    56(%rdi),%r15
1916         adcq    $0,%rax
1917         movq    -8(%rbp),%rcx
1918         xorq    %rsi,%rsi
1919
# movq %xmm2,%rbp — re-restore the modulus pointer for the next round.
1920 .byte   102,72,15,126,213
1921
1922         movq    %r8,0(%rdi)
1923         movq    %r9,8(%rdi)
# movq %xmm3,%r9 — restore the length saved by the caller.
1924 .byte   102,73,15,126,217
1925         movq    %r10,16(%rdi)
1926         movq    %r11,24(%rdi)
1927         movq    %r12,32(%rdi)
1928         movq    %r13,40(%rdi)
1929         movq    %r14,48(%rdi)
1930         movq    %r15,56(%rdi)
1931         leaq    64(%rdi),%rdi
1932
1933         cmpq    %rdx,%rdi
1934         jb      .L8x_reduction_loop
1935         .byte   0xf3,0xc3
1936 .size   bn_sqr8x_internal,.-bn_sqr8x_internal
#-----------------------------------------------------------------------
# __bn_post4x_internal
# Branchless conditional final subtraction after a Montgomery round:
# each group of four limbs computes t + (mask & ~n) with carry, i.e.
# t - n when the mask in %rax is all-ones and t unchanged when it is
# zero — no data-dependent branches on secret values.
# NOTE(review): contract inherited from callers in this file: %rbp =
# modulus, %r9 = negated byte length, %rax = 0/-1 selection mask; the
# two .byte sequences are movq %xmm1,%rdi / movq %xmm1,%rsi restores of
# the result pointer stashed by the caller — confirm against
# x86_64-mont5.pl.
#-----------------------------------------------------------------------
1937 .type   __bn_post4x_internal,@function
1938 .align  32
1939 __bn_post4x_internal:
1940         movq    0(%rbp),%r12
1941         leaq    (%rdi,%r9,1),%rbx
1942         movq    %r9,%rcx
1943 .byte   102,72,15,126,207
1944         negq    %rax
1945 .byte   102,72,15,126,206
1946         sarq    $3+2,%rcx
1947         decq    %r12
1948         xorq    %r10,%r10
1949         movq    8(%rbp),%r13
1950         movq    16(%rbp),%r14
1951         movq    24(%rbp),%r15
1952         jmp     .Lsqr4x_sub_entry
1953
1954 .align  16
1955 .Lsqr4x_sub:
1956         movq    0(%rbp),%r12
1957         movq    8(%rbp),%r13
1958         movq    16(%rbp),%r14
1959         movq    24(%rbp),%r15
1960 .Lsqr4x_sub_entry:
1961         leaq    32(%rbp),%rbp
# Mask the complemented modulus limbs: %rax is 0 or all-ones.
1962         notq    %r12
1963         notq    %r13
1964         notq    %r14
1965         notq    %r15
1966         andq    %rax,%r12
1967         andq    %rax,%r13
1968         andq    %rax,%r14
1969         andq    %rax,%r15
1970
# negq re-arms the borrow saved in %r10 by the previous iteration.
1971         negq    %r10
1972         adcq    0(%rbx),%r12
1973         adcq    8(%rbx),%r13
1974         adcq    16(%rbx),%r14
1975         adcq    24(%rbx),%r15
1976         movq    %r12,0(%rdi)
1977         leaq    32(%rbx),%rbx
1978         movq    %r13,8(%rdi)
1979         sbbq    %r10,%r10
1980         movq    %r14,16(%rdi)
1981         movq    %r15,24(%rdi)
1982         leaq    32(%rdi),%rdi
1983
1984         incq    %rcx
1985         jnz     .Lsqr4x_sub
1986
# Leave %r10 = byte length, %r9 = its negation for the caller.
1987         movq    %r9,%r10
1988         negq    %r9
1989         .byte   0xf3,0xc3
1990 .size   __bn_post4x_internal,.-__bn_post4x_internal
#-----------------------------------------------------------------------
# bn_from_montgomery — thin dispatcher.
# If the limb count in %r9d is a multiple of 8, hand the whole call
# (same argument registers) to bn_from_mont8x below; otherwise report
# "not handled" by returning 0 in %eax.
#-----------------------------------------------------------------------
.globl  bn_from_montgomery
.type   bn_from_montgomery,@function
.align  32
bn_from_montgomery:
        testl   $7,%r9d                 # num % 8 == 0 ?
        jnz     .Lfrom_not_multiple8
        jmp     bn_from_mont8x          # tail call, arguments untouched
.Lfrom_not_multiple8:
        xorl    %eax,%eax               # return 0: caller must fall back
        .byte   0xf3,0xc3               # rep ret
.size   bn_from_montgomery,.-bn_from_montgomery
2000
#-----------------------------------------------------------------------
# bn_from_mont8x
# Conversion out of Montgomery form for limb counts divisible by 8
# (reached only via the bn_from_montgomery dispatcher above): copies
# the input into a zero-extended scratch buffer (.Lmul_by_1), runs one
# Montgomery reduction (MULX/ADX path __bn_sqrx8x_reduction when the
# CPU feature bits match, else __bn_sqr8x_reduction), applies the
# conditional final subtraction, wipes the scratch area, and restores
# callee-saved registers.  Returns 1 in %rax.
# NOTE(review): argument registers follow the other entry points in
# this file (%rdi=rp, %rsi=ap, %rcx=np, %r8=&n0, %r9d=num) — confirm
# against x86_64-mont5.pl.
#-----------------------------------------------------------------------
2001 .type   bn_from_mont8x,@function
2002 .align  32
2003 bn_from_mont8x:
2004 .byte   0x67
2005         movq    %rsp,%rax
2006         pushq   %rbx
2007         pushq   %rbp
2008         pushq   %r12
2009         pushq   %r13
2010         pushq   %r14
2011         pushq   %r15
2012 .Lfrom_prologue:
2013
# %r9 becomes the byte length (num*8), negated; %r10 = 3*num bytes.
2014         shll    $3,%r9d
2015         leaq    (%r9,%r9,2),%r10
2016         negq    %r9
2017         movq    (%r8),%r8
2018
2019
2020
2021
2022
2023
2024
2025
# Pick a stack frame position that avoids a cache-bank alias with the
# output pointer (%rdi), then drop %rsp with a 4K-step page walk so
# every guard page is touched.
2026         leaq    -320(%rsp,%r9,2),%r11
2027         movq    %rsp,%rbp
2028         subq    %rdi,%r11
2029         andq    $4095,%r11
2030         cmpq    %r11,%r10
2031         jb      .Lfrom_sp_alt
2032         subq    %r11,%rbp
2033         leaq    -320(%rbp,%r9,2),%rbp
2034         jmp     .Lfrom_sp_done
2035
2036 .align  32
2037 .Lfrom_sp_alt:
2038         leaq    4096-320(,%r9,2),%r10
2039         leaq    -320(%rbp,%r9,2),%rbp
2040         subq    %r10,%r11
2041         movq    $0,%r10
2042         cmovcq  %r10,%r11
2043         subq    %r11,%rbp
2044 .Lfrom_sp_done:
2045         andq    $-64,%rbp
2046         movq    %rsp,%r11
2047         subq    %rbp,%r11
2048         andq    $-4096,%r11
2049         leaq    (%r11,%rbp,1),%rsp
2050         movq    (%rsp),%r10
2051         cmpq    %rbp,%rsp
2052         ja      .Lfrom_page_walk
2053         jmp     .Lfrom_page_walk_done
2054
2055 .Lfrom_page_walk:
2056         leaq    -4096(%rsp),%rsp
2057         movq    (%rsp),%r10
2058         cmpq    %rbp,%rsp
2059         ja      .Lfrom_page_walk
2060 .Lfrom_page_walk_done:
2061
2062         movq    %r9,%r10
2063         negq    %r9
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
# Frame layout: 32(%rsp) = n0, 40(%rsp) = original %rsp for epilogue.
2074         movq    %r8,32(%rsp)
2075         movq    %rax,40(%rsp)
2076 .Lfrom_body:
2077         movq    %r9,%r11
2078         leaq    48(%rsp),%rax
2079         pxor    %xmm0,%xmm0
2080         jmp     .Lmul_by_1
2081
# Copy ap into the low half of the scratch buffer and zero the upper
# half — the layout __bn_sqr8x_reduction expects for a plain reduction.
2082 .align  32
2083 .Lmul_by_1:
2084         movdqu  (%rsi),%xmm1
2085         movdqu  16(%rsi),%xmm2
2086         movdqu  32(%rsi),%xmm3
2087         movdqa  %xmm0,(%rax,%r9,1)
2088         movdqu  48(%rsi),%xmm4
2089         movdqa  %xmm0,16(%rax,%r9,1)
2090 .byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2091         movdqa  %xmm1,(%rax)
2092         movdqa  %xmm0,32(%rax,%r9,1)
2093         movdqa  %xmm2,16(%rax)
2094         movdqa  %xmm0,48(%rax,%r9,1)
2095         movdqa  %xmm3,32(%rax)
2096         movdqa  %xmm4,48(%rax)
2097         leaq    64(%rax),%rax
2098         subq    $64,%r11
2099         jnz     .Lmul_by_1
2100
# Stash rp/np/num in xmm registers for the reduction helpers:
# movq %rdi,%xmm1 / movq %rcx,%xmm2 / movq %r10,%xmm3.
2101 .byte   102,72,15,110,207
2102 .byte   102,72,15,110,209
2103 .byte   0x67
2104         movq    %rcx,%rbp
2105 .byte   102,73,15,110,218
# Feature dispatch: take the MULX/ADX reduction when the required
# OPENSSL_ia32cap_P bits (mask 0x80108) are all present.
2106         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
2107         andl    $0x80108,%r11d
2108         cmpl    $0x80108,%r11d
2109         jne     .Lfrom_mont_nox
2110
2111         leaq    (%rax,%r9,1),%rdi
2112         call    __bn_sqrx8x_reduction
2113         call    __bn_postx4x_internal
2114
2115         pxor    %xmm0,%xmm0
2116         leaq    48(%rsp),%rax
2117         movq    40(%rsp),%rsi
2118         jmp     .Lfrom_mont_zero
2119
2120 .align  32
2121 .Lfrom_mont_nox:
2122         call    __bn_sqr8x_reduction
2123         call    __bn_post4x_internal
2124
2125         pxor    %xmm0,%xmm0
2126         leaq    48(%rsp),%rax
2127         movq    40(%rsp),%rsi
2128         jmp     .Lfrom_mont_zero
2129
# Scrub the scratch buffer before returning (no secrets left on stack).
2130 .align  32
2131 .Lfrom_mont_zero:
2132         movdqa  %xmm0,0(%rax)
2133         movdqa  %xmm0,16(%rax)
2134         movdqa  %xmm0,32(%rax)
2135         movdqa  %xmm0,48(%rax)
2136         leaq    64(%rax),%rax
2137         subq    $32,%r9
2138         jnz     .Lfrom_mont_zero
2139
2140         movq    $1,%rax
2141         movq    -48(%rsi),%r15
2142         movq    -40(%rsi),%r14
2143         movq    -32(%rsi),%r13
2144         movq    -24(%rsi),%r12
2145         movq    -16(%rsi),%rbp
2146         movq    -8(%rsi),%rbx
2147         leaq    (%rsi),%rsp
2148 .Lfrom_epilogue:
2149         .byte   0xf3,0xc3
2150 .size   bn_from_mont8x,.-bn_from_mont8x
#-----------------------------------------------------------------------
# bn_mulx4x_mont_gather5
# MULX-path entry for Montgomery multiplication with gather-5 table
# lookup: saves callee-saved registers, builds an anti-aliased,
# page-walked stack frame (same scheme as bn_from_mont8x above), stores
# n0 at 32(%rsp) and the caller's %rsp at 40(%rsp), then delegates all
# the work to mulx4x_internal.  Returns 1 in %rax.
# NOTE(review): %r9d arrives as the limb count and %r8 as &n0 —
# mirrors the other entry points in this file; confirm against
# x86_64-mont5.pl.
#-----------------------------------------------------------------------
2151 .type   bn_mulx4x_mont_gather5,@function
2152 .align  32
2153 bn_mulx4x_mont_gather5:
2154         movq    %rsp,%rax
2155 .Lmulx4x_enter:
2156         pushq   %rbx
2157         pushq   %rbp
2158         pushq   %r12
2159         pushq   %r13
2160         pushq   %r14
2161         pushq   %r15
2162 .Lmulx4x_prologue:
2163
# %r9 = -(num*8) bytes; %r10 = 3*num*8 bytes for the frame-size test.
2164         shll    $3,%r9d
2165         leaq    (%r9,%r9,2),%r10
2166         negq    %r9
2167         movq    (%r8),%r8
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
# Choose a frame position that does not alias the output pointer
# modulo 4K, then lower %rsp one page at a time (stack-guard probing).
2178         leaq    -320(%rsp,%r9,2),%r11
2179         movq    %rsp,%rbp
2180         subq    %rdi,%r11
2181         andq    $4095,%r11
2182         cmpq    %r11,%r10
2183         jb      .Lmulx4xsp_alt
2184         subq    %r11,%rbp
2185         leaq    -320(%rbp,%r9,2),%rbp
2186         jmp     .Lmulx4xsp_done
2187
2188 .Lmulx4xsp_alt:
2189         leaq    4096-320(,%r9,2),%r10
2190         leaq    -320(%rbp,%r9,2),%rbp
2191         subq    %r10,%r11
2192         movq    $0,%r10
2193         cmovcq  %r10,%r11
2194         subq    %r11,%rbp
2195 .Lmulx4xsp_done:
2196         andq    $-64,%rbp
2197         movq    %rsp,%r11
2198         subq    %rbp,%r11
2199         andq    $-4096,%r11
2200         leaq    (%r11,%rbp,1),%rsp
2201         movq    (%rsp),%r10
2202         cmpq    %rbp,%rsp
2203         ja      .Lmulx4x_page_walk
2204         jmp     .Lmulx4x_page_walk_done
2205
2206 .Lmulx4x_page_walk:
2207         leaq    -4096(%rsp),%rsp
2208         movq    (%rsp),%r10
2209         cmpq    %rbp,%rsp
2210         ja      .Lmulx4x_page_walk
2211 .Lmulx4x_page_walk_done:
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
# Frame layout: 32(%rsp) = n0, 40(%rsp) = saved %rsp for the epilogue.
2225         movq    %r8,32(%rsp)
2226         movq    %rax,40(%rsp)
2227 .Lmulx4x_body:
2228         call    mulx4x_internal
2229
# Restore callee-saved registers from below the saved stack pointer.
2230         movq    40(%rsp),%rsi
2231         movq    $1,%rax
2232
2233         movq    -48(%rsi),%r15
2234         movq    -40(%rsi),%r14
2235         movq    -32(%rsi),%r13
2236         movq    -24(%rsi),%r12
2237         movq    -16(%rsi),%rbp
2238         movq    -8(%rsi),%rbx
2239         leaq    (%rsi),%rsp
2240 .Lmulx4x_epilogue:
2241         .byte   0xf3,0xc3
2242 .size   bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2243
2244 .type   mulx4x_internal,@function
2245 .align  32
2246 mulx4x_internal:
2247         movq    %r9,8(%rsp)
2248         movq    %r9,%r10
2249         negq    %r9
2250         shlq    $5,%r9
2251         negq    %r10
2252         leaq    128(%rdx,%r9,1),%r13
2253         shrq    $5+5,%r9
2254         movd    8(%rax),%xmm5
2255         subq    $1,%r9
2256         leaq    .Linc(%rip),%rax
2257         movq    %r13,16+8(%rsp)
2258         movq    %r9,24+8(%rsp)
2259         movq    %rdi,56+8(%rsp)
2260         movdqa  0(%rax),%xmm0
2261         movdqa  16(%rax),%xmm1
2262         leaq    88-112(%rsp,%r10,1),%r10
2263         leaq    128(%rdx),%rdi
2264
2265         pshufd  $0,%xmm5,%xmm5
2266         movdqa  %xmm1,%xmm4
2267 .byte   0x67
2268         movdqa  %xmm1,%xmm2
2269 .byte   0x67
2270         paddd   %xmm0,%xmm1
2271         pcmpeqd %xmm5,%xmm0
2272         movdqa  %xmm4,%xmm3
2273         paddd   %xmm1,%xmm2
2274         pcmpeqd %xmm5,%xmm1
2275         movdqa  %xmm0,112(%r10)
2276         movdqa  %xmm4,%xmm0
2277
2278         paddd   %xmm2,%xmm3
2279         pcmpeqd %xmm5,%xmm2
2280         movdqa  %xmm1,128(%r10)
2281         movdqa  %xmm4,%xmm1
2282
2283         paddd   %xmm3,%xmm0
2284         pcmpeqd %xmm5,%xmm3
2285         movdqa  %xmm2,144(%r10)
2286         movdqa  %xmm4,%xmm2
2287
2288         paddd   %xmm0,%xmm1
2289         pcmpeqd %xmm5,%xmm0
2290         movdqa  %xmm3,160(%r10)
2291         movdqa  %xmm4,%xmm3
2292         paddd   %xmm1,%xmm2
2293         pcmpeqd %xmm5,%xmm1
2294         movdqa  %xmm0,176(%r10)
2295         movdqa  %xmm4,%xmm0
2296
2297         paddd   %xmm2,%xmm3
2298         pcmpeqd %xmm5,%xmm2
2299         movdqa  %xmm1,192(%r10)
2300         movdqa  %xmm4,%xmm1
2301
2302         paddd   %xmm3,%xmm0
2303         pcmpeqd %xmm5,%xmm3
2304         movdqa  %xmm2,208(%r10)
2305         movdqa  %xmm4,%xmm2
2306
2307         paddd   %xmm0,%xmm1
2308         pcmpeqd %xmm5,%xmm0
2309         movdqa  %xmm3,224(%r10)
2310         movdqa  %xmm4,%xmm3
2311         paddd   %xmm1,%xmm2
2312         pcmpeqd %xmm5,%xmm1
2313         movdqa  %xmm0,240(%r10)
2314         movdqa  %xmm4,%xmm0
2315
2316         paddd   %xmm2,%xmm3
2317         pcmpeqd %xmm5,%xmm2
2318         movdqa  %xmm1,256(%r10)
2319         movdqa  %xmm4,%xmm1
2320
2321         paddd   %xmm3,%xmm0
2322         pcmpeqd %xmm5,%xmm3
2323         movdqa  %xmm2,272(%r10)
2324         movdqa  %xmm4,%xmm2
2325
2326         paddd   %xmm0,%xmm1
2327         pcmpeqd %xmm5,%xmm0
2328         movdqa  %xmm3,288(%r10)
2329         movdqa  %xmm4,%xmm3
2330 .byte   0x67
2331         paddd   %xmm1,%xmm2
2332         pcmpeqd %xmm5,%xmm1
2333         movdqa  %xmm0,304(%r10)
2334
2335         paddd   %xmm2,%xmm3
2336         pcmpeqd %xmm5,%xmm2
2337         movdqa  %xmm1,320(%r10)
2338
2339         pcmpeqd %xmm5,%xmm3
2340         movdqa  %xmm2,336(%r10)
2341
2342         pand    64(%rdi),%xmm0
2343         pand    80(%rdi),%xmm1
2344         pand    96(%rdi),%xmm2
2345         movdqa  %xmm3,352(%r10)
2346         pand    112(%rdi),%xmm3
2347         por     %xmm2,%xmm0
2348         por     %xmm3,%xmm1
2349         movdqa  -128(%rdi),%xmm4
2350         movdqa  -112(%rdi),%xmm5
2351         movdqa  -96(%rdi),%xmm2
2352         pand    112(%r10),%xmm4
2353         movdqa  -80(%rdi),%xmm3
2354         pand    128(%r10),%xmm5
2355         por     %xmm4,%xmm0
2356         pand    144(%r10),%xmm2
2357         por     %xmm5,%xmm1
2358         pand    160(%r10),%xmm3
2359         por     %xmm2,%xmm0
2360         por     %xmm3,%xmm1
2361         movdqa  -64(%rdi),%xmm4
2362         movdqa  -48(%rdi),%xmm5
2363         movdqa  -32(%rdi),%xmm2
2364         pand    176(%r10),%xmm4
2365         movdqa  -16(%rdi),%xmm3
2366         pand    192(%r10),%xmm5
2367         por     %xmm4,%xmm0
2368         pand    208(%r10),%xmm2
2369         por     %xmm5,%xmm1
2370         pand    224(%r10),%xmm3
2371         por     %xmm2,%xmm0
2372         por     %xmm3,%xmm1
2373         movdqa  0(%rdi),%xmm4
2374         movdqa  16(%rdi),%xmm5
2375         movdqa  32(%rdi),%xmm2
2376         pand    240(%r10),%xmm4
2377         movdqa  48(%rdi),%xmm3
2378         pand    256(%r10),%xmm5
2379         por     %xmm4,%xmm0
2380         pand    272(%r10),%xmm2
2381         por     %xmm5,%xmm1
2382         pand    288(%r10),%xmm3
2383         por     %xmm2,%xmm0
2384         por     %xmm3,%xmm1
2385         pxor    %xmm1,%xmm0
2386         pshufd  $0x4e,%xmm0,%xmm1
2387         por     %xmm1,%xmm0
2388         leaq    256(%rdi),%rdi
2389 .byte   102,72,15,126,194
2390         leaq    64+32+8(%rsp),%rbx
2391
2392         movq    %rdx,%r9
2393         mulxq   0(%rsi),%r8,%rax
2394         mulxq   8(%rsi),%r11,%r12
2395         addq    %rax,%r11
2396         mulxq   16(%rsi),%rax,%r13
2397         adcq    %rax,%r12
2398         adcq    $0,%r13
2399         mulxq   24(%rsi),%rax,%r14
2400
2401         movq    %r8,%r15
2402         imulq   32+8(%rsp),%r8
2403         xorq    %rbp,%rbp
2404         movq    %r8,%rdx
2405
2406         movq    %rdi,8+8(%rsp)
2407
2408         leaq    32(%rsi),%rsi
2409         adcxq   %rax,%r13
2410         adcxq   %rbp,%r14
2411
2412         mulxq   0(%rcx),%rax,%r10
2413         adcxq   %rax,%r15
2414         adoxq   %r11,%r10
2415         mulxq   8(%rcx),%rax,%r11
2416         adcxq   %rax,%r10
2417         adoxq   %r12,%r11
2418         mulxq   16(%rcx),%rax,%r12
2419         movq    24+8(%rsp),%rdi
2420         movq    %r10,-32(%rbx)
2421         adcxq   %rax,%r11
2422         adoxq   %r13,%r12
2423         mulxq   24(%rcx),%rax,%r15
2424         movq    %r9,%rdx
2425         movq    %r11,-24(%rbx)
2426         adcxq   %rax,%r12
2427         adoxq   %rbp,%r15
2428         leaq    32(%rcx),%rcx
2429         movq    %r12,-16(%rbx)
2430         jmp     .Lmulx4x_1st
2431
2432 .align  32
2433 .Lmulx4x_1st:
2434         adcxq   %rbp,%r15
2435         mulxq   0(%rsi),%r10,%rax
2436         adcxq   %r14,%r10
2437         mulxq   8(%rsi),%r11,%r14
2438         adcxq   %rax,%r11
2439         mulxq   16(%rsi),%r12,%rax
2440         adcxq   %r14,%r12
2441         mulxq   24(%rsi),%r13,%r14
2442 .byte   0x67,0x67
2443         movq    %r8,%rdx
2444         adcxq   %rax,%r13
2445         adcxq   %rbp,%r14
2446         leaq    32(%rsi),%rsi
2447         leaq    32(%rbx),%rbx
2448
2449         adoxq   %r15,%r10
2450         mulxq   0(%rcx),%rax,%r15
2451         adcxq   %rax,%r10
2452         adoxq   %r15,%r11
2453         mulxq   8(%rcx),%rax,%r15
2454         adcxq   %rax,%r11
2455         adoxq   %r15,%r12
2456         mulxq   16(%rcx),%rax,%r15
2457         movq    %r10,-40(%rbx)
2458         adcxq   %rax,%r12
2459         movq    %r11,-32(%rbx)
2460         adoxq   %r15,%r13
2461         mulxq   24(%rcx),%rax,%r15
2462         movq    %r9,%rdx
2463         movq    %r12,-24(%rbx)
2464         adcxq   %rax,%r13
2465         adoxq   %rbp,%r15
2466         leaq    32(%rcx),%rcx
2467         movq    %r13,-16(%rbx)
2468
2469         decq    %rdi
2470         jnz     .Lmulx4x_1st
2471
2472         movq    8(%rsp),%rax
2473         adcq    %rbp,%r15
2474         leaq    (%rsi,%rax,1),%rsi
2475         addq    %r15,%r14
2476         movq    8+8(%rsp),%rdi
2477         adcq    %rbp,%rbp
2478         movq    %r14,-8(%rbx)
2479         jmp     .Lmulx4x_outer
2480
2481 .align  32
2482 .Lmulx4x_outer:
2483         leaq    16-256(%rbx),%r10
2484         pxor    %xmm4,%xmm4
2485 .byte   0x67,0x67
2486         pxor    %xmm5,%xmm5
2487         movdqa  -128(%rdi),%xmm0
2488         movdqa  -112(%rdi),%xmm1
2489         movdqa  -96(%rdi),%xmm2
2490         pand    256(%r10),%xmm0
2491         movdqa  -80(%rdi),%xmm3
2492         pand    272(%r10),%xmm1
2493         por     %xmm0,%xmm4
2494         pand    288(%r10),%xmm2
2495         por     %xmm1,%xmm5
2496         pand    304(%r10),%xmm3
2497         por     %xmm2,%xmm4
2498         por     %xmm3,%xmm5
2499         movdqa  -64(%rdi),%xmm0
2500         movdqa  -48(%rdi),%xmm1
2501         movdqa  -32(%rdi),%xmm2
2502         pand    320(%r10),%xmm0
2503         movdqa  -16(%rdi),%xmm3
2504         pand    336(%r10),%xmm1
2505         por     %xmm0,%xmm4
2506         pand    352(%r10),%xmm2
2507         por     %xmm1,%xmm5
2508         pand    368(%r10),%xmm3
2509         por     %xmm2,%xmm4
2510         por     %xmm3,%xmm5
2511         movdqa  0(%rdi),%xmm0
2512         movdqa  16(%rdi),%xmm1
2513         movdqa  32(%rdi),%xmm2
2514         pand    384(%r10),%xmm0
2515         movdqa  48(%rdi),%xmm3
2516         pand    400(%r10),%xmm1
2517         por     %xmm0,%xmm4
2518         pand    416(%r10),%xmm2
2519         por     %xmm1,%xmm5
2520         pand    432(%r10),%xmm3
2521         por     %xmm2,%xmm4
2522         por     %xmm3,%xmm5
2523         movdqa  64(%rdi),%xmm0
2524         movdqa  80(%rdi),%xmm1
2525         movdqa  96(%rdi),%xmm2
2526         pand    448(%r10),%xmm0
2527         movdqa  112(%rdi),%xmm3
2528         pand    464(%r10),%xmm1
2529         por     %xmm0,%xmm4
2530         pand    480(%r10),%xmm2
2531         por     %xmm1,%xmm5
2532         pand    496(%r10),%xmm3
2533         por     %xmm2,%xmm4
2534         por     %xmm3,%xmm5
2535         por     %xmm5,%xmm4
2536         pshufd  $0x4e,%xmm4,%xmm0
2537         por     %xmm4,%xmm0
2538         leaq    256(%rdi),%rdi
2539 .byte   102,72,15,126,194
2540
2541         movq    %rbp,(%rbx)
2542         leaq    32(%rbx,%rax,1),%rbx
2543         mulxq   0(%rsi),%r8,%r11
2544         xorq    %rbp,%rbp
2545         movq    %rdx,%r9
2546         mulxq   8(%rsi),%r14,%r12
2547         adoxq   -32(%rbx),%r8
2548         adcxq   %r14,%r11
2549         mulxq   16(%rsi),%r15,%r13
2550         adoxq   -24(%rbx),%r11
2551         adcxq   %r15,%r12
2552         mulxq   24(%rsi),%rdx,%r14
2553         adoxq   -16(%rbx),%r12
2554         adcxq   %rdx,%r13
2555         leaq    (%rcx,%rax,1),%rcx
2556         leaq    32(%rsi),%rsi
2557         adoxq   -8(%rbx),%r13
2558         adcxq   %rbp,%r14
2559         adoxq   %rbp,%r14
2560
2561         movq    %r8,%r15
2562         imulq   32+8(%rsp),%r8
2563
2564         movq    %r8,%rdx
2565         xorq    %rbp,%rbp
2566         movq    %rdi,8+8(%rsp)
2567
2568         mulxq   0(%rcx),%rax,%r10
2569         adcxq   %rax,%r15
2570         adoxq   %r11,%r10
2571         mulxq   8(%rcx),%rax,%r11
2572         adcxq   %rax,%r10
2573         adoxq   %r12,%r11
2574         mulxq   16(%rcx),%rax,%r12
2575         adcxq   %rax,%r11
2576         adoxq   %r13,%r12
2577         mulxq   24(%rcx),%rax,%r15
2578         movq    %r9,%rdx
2579         movq    24+8(%rsp),%rdi
2580         movq    %r10,-32(%rbx)
2581         adcxq   %rax,%r12
2582         movq    %r11,-24(%rbx)
2583         adoxq   %rbp,%r15
2584         movq    %r12,-16(%rbx)
2585         leaq    32(%rcx),%rcx
2586         jmp     .Lmulx4x_inner
2587
2588 .align  32
2589 .Lmulx4x_inner:
2590         mulxq   0(%rsi),%r10,%rax
2591         adcxq   %rbp,%r15
2592         adoxq   %r14,%r10
2593         mulxq   8(%rsi),%r11,%r14
2594         adcxq   0(%rbx),%r10
2595         adoxq   %rax,%r11
2596         mulxq   16(%rsi),%r12,%rax
2597         adcxq   8(%rbx),%r11
2598         adoxq   %r14,%r12
2599         mulxq   24(%rsi),%r13,%r14
2600         movq    %r8,%rdx
2601         adcxq   16(%rbx),%r12
2602         adoxq   %rax,%r13
2603         adcxq   24(%rbx),%r13
2604         adoxq   %rbp,%r14
2605         leaq    32(%rsi),%rsi
2606         leaq    32(%rbx),%rbx
2607         adcxq   %rbp,%r14
2608
2609         adoxq   %r15,%r10
2610         mulxq   0(%rcx),%rax,%r15
2611         adcxq   %rax,%r10
2612         adoxq   %r15,%r11
2613         mulxq   8(%rcx),%rax,%r15
2614         adcxq   %rax,%r11
2615         adoxq   %r15,%r12
2616         mulxq   16(%rcx),%rax,%r15
2617         movq    %r10,-40(%rbx)
2618         adcxq   %rax,%r12
2619         adoxq   %r15,%r13
2620         movq    %r11,-32(%rbx)
2621         mulxq   24(%rcx),%rax,%r15
2622         movq    %r9,%rdx
2623         leaq    32(%rcx),%rcx
2624         movq    %r12,-24(%rbx)
2625         adcxq   %rax,%r13
2626         adoxq   %rbp,%r15
2627         movq    %r13,-16(%rbx)
2628
2629         decq    %rdi
2630         jnz     .Lmulx4x_inner
2631
2632         movq    0+8(%rsp),%rax
2633         adcq    %rbp,%r15
2634         subq    0(%rbx),%rdi
2635         movq    8+8(%rsp),%rdi
2636         movq    16+8(%rsp),%r10
2637         adcq    %r15,%r14
2638         leaq    (%rsi,%rax,1),%rsi
2639         adcq    %rbp,%rbp
2640         movq    %r14,-8(%rbx)
2641
2642         cmpq    %r10,%rdi
2643         jb      .Lmulx4x_outer
2644
2645         movq    -8(%rcx),%r10
2646         movq    %rbp,%r8
2647         movq    (%rcx,%rax,1),%r12
2648         leaq    (%rcx,%rax,1),%rbp
2649         movq    %rax,%rcx
2650         leaq    (%rbx,%rax,1),%rdi
2651         xorl    %eax,%eax
2652         xorq    %r15,%r15
2653         subq    %r14,%r10
2654         adcq    %r15,%r15
2655         orq     %r15,%r8
2656         sarq    $3+2,%rcx
2657         subq    %r8,%rax
2658         movq    56+8(%rsp),%rdx
2659         decq    %r12
2660         movq    8(%rbp),%r13
2661         xorq    %r8,%r8
2662         movq    16(%rbp),%r14
2663         movq    24(%rbp),%r15
2664         jmp     .Lsqrx4x_sub_entry
2665 .size   mulx4x_internal,.-mulx4x_internal
2666 .type   bn_powerx5,@function
2667 .align  32
2668 bn_powerx5:
2669         movq    %rsp,%rax
2670 .Lpowerx5_enter:
2671         pushq   %rbx
2672         pushq   %rbp
2673         pushq   %r12
2674         pushq   %r13
2675         pushq   %r14
2676         pushq   %r15
2677 .Lpowerx5_prologue:
2678
2679         shll    $3,%r9d
2680         leaq    (%r9,%r9,2),%r10
2681         negq    %r9
2682         movq    (%r8),%r8
2683
2684
2685
2686
2687
2688
2689
2690
2691         leaq    -320(%rsp,%r9,2),%r11
2692         movq    %rsp,%rbp
2693         subq    %rdi,%r11
2694         andq    $4095,%r11
2695         cmpq    %r11,%r10
2696         jb      .Lpwrx_sp_alt
2697         subq    %r11,%rbp
2698         leaq    -320(%rbp,%r9,2),%rbp
2699         jmp     .Lpwrx_sp_done
2700
2701 .align  32
2702 .Lpwrx_sp_alt:
2703         leaq    4096-320(,%r9,2),%r10
2704         leaq    -320(%rbp,%r9,2),%rbp
2705         subq    %r10,%r11
2706         movq    $0,%r10
2707         cmovcq  %r10,%r11
2708         subq    %r11,%rbp
2709 .Lpwrx_sp_done:
2710         andq    $-64,%rbp
2711         movq    %rsp,%r11
2712         subq    %rbp,%r11
2713         andq    $-4096,%r11
2714         leaq    (%r11,%rbp,1),%rsp
2715         movq    (%rsp),%r10
2716         cmpq    %rbp,%rsp
2717         ja      .Lpwrx_page_walk
2718         jmp     .Lpwrx_page_walk_done
2719
2720 .Lpwrx_page_walk:
2721         leaq    -4096(%rsp),%rsp
2722         movq    (%rsp),%r10
2723         cmpq    %rbp,%rsp
2724         ja      .Lpwrx_page_walk
2725 .Lpwrx_page_walk_done:
2726
2727         movq    %r9,%r10
2728         negq    %r9
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741         pxor    %xmm0,%xmm0
2742 .byte   102,72,15,110,207
2743 .byte   102,72,15,110,209
2744 .byte   102,73,15,110,218
2745 .byte   102,72,15,110,226
2746         movq    %r8,32(%rsp)
2747         movq    %rax,40(%rsp)
2748 .Lpowerx5_body:
2749
2750         call    __bn_sqrx8x_internal
2751         call    __bn_postx4x_internal
2752         call    __bn_sqrx8x_internal
2753         call    __bn_postx4x_internal
2754         call    __bn_sqrx8x_internal
2755         call    __bn_postx4x_internal
2756         call    __bn_sqrx8x_internal
2757         call    __bn_postx4x_internal
2758         call    __bn_sqrx8x_internal
2759         call    __bn_postx4x_internal
2760
2761         movq    %r10,%r9
2762         movq    %rsi,%rdi
2763 .byte   102,72,15,126,209
2764 .byte   102,72,15,126,226
2765         movq    40(%rsp),%rax
2766
2767         call    mulx4x_internal
2768
2769         movq    40(%rsp),%rsi
2770         movq    $1,%rax
2771
2772         movq    -48(%rsi),%r15
2773         movq    -40(%rsi),%r14
2774         movq    -32(%rsi),%r13
2775         movq    -24(%rsi),%r12
2776         movq    -16(%rsi),%rbp
2777         movq    -8(%rsi),%rbx
2778         leaq    (%rsi),%rsp
2779 .Lpowerx5_epilogue:
2780         .byte   0xf3,0xc3
2781 .size   bn_powerx5,.-bn_powerx5
2782
# ----------------------------------------------------------------------
# __bn_sqrx8x_internal: MULX/ADX Montgomery squaring kernel.
# Squares the num-word (num = %r9 bytes on entry) operand at %rsi into the
# stack temporary at 48+8(%rsp), then falls through / re-enters at
# __bn_sqrx8x_reduction to Montgomery-reduce the double-width product
# using n0 stored at 32+8(%rsp).  Two independent carry chains are kept
# live simultaneously via ADCX (CF) and ADOX (OF) -- instruction order is
# load-bearing; do not reorder.  Callers must zero %xmm0 first (see
# bn_powerx5's pxor above).  Auto-generated from x86_64-mont5.pl.
2783 .globl  bn_sqrx8x_internal
2784 .hidden bn_sqrx8x_internal
2785 .type   bn_sqrx8x_internal,@function
2786 .align  32
2787 bn_sqrx8x_internal:
2788 __bn_sqrx8x_internal:
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
# %rdi = tp (stack temporary), %rbp = end of input (ap + num); stash
# num at 0+8(%rsp) and the input end at 8+8(%rsp) for the loops below.
2829         leaq    48+8(%rsp),%rdi
2830         leaq    (%rsi,%r9,1),%rbp
2831         movq    %r9,0+8(%rsp)
2832         movq    %rbp,8+8(%rsp)
2833         jmp     .Lsqr8x_zero_start
2834
2835 .align  32
# Multi-byte NOP padding emitted by the generator to align the loop.
2836 .byte   0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
# Zero the temporary area in 128-byte strides (requires %xmm0 == 0, which
# callers guarantee).  0x3e is a DS-segment-override prefix used as a
# 1-byte pad.
2837 .Lsqrx8x_zero:
2838 .byte   0x3e
2839         movdqa  %xmm0,0(%rdi)
2840         movdqa  %xmm0,16(%rdi)
2841         movdqa  %xmm0,32(%rdi)
2842         movdqa  %xmm0,48(%rdi)
2843 .Lsqr8x_zero_start:
2844         movdqa  %xmm0,64(%rdi)
2845         movdqa  %xmm0,80(%rdi)
2846         movdqa  %xmm0,96(%rdi)
2847         movdqa  %xmm0,112(%rdi)
2848         leaq    128(%rdi),%rdi
2849         subq    $64,%r9
2850         jnz     .Lsqrx8x_zero
2851
2852         movq    0(%rsi),%rdx
2853
# Clear the accumulator registers and the carry seed before the first
# 8x8 block of cross products.
2854         xorq    %r10,%r10
2855         xorq    %r11,%r11
2856         xorq    %r12,%r12
2857         xorq    %r13,%r13
2858         xorq    %r14,%r14
2859         xorq    %r15,%r15
2860         leaq    48+8(%rsp),%rdi
2861         xorq    %rbp,%rbp
2862         jmp     .Lsqrx8x_outer_loop
2863
2864 .align  32
# Outer loop: compute the off-diagonal products a[i]*a[j] (i<j) for the
# current 8-word block.  %rdx holds the multiplicand word for mulx; the
# interleaved .byte sequences are hand-encoded mulx instructions (chosen
# by the generator for code-size/alignment -- NOTE(review): confirm exact
# operands against x86_64-mont5.pl if editing).
2865 .Lsqrx8x_outer_loop:
2866         mulxq   8(%rsi),%r8,%rax
2867         adcxq   %r9,%r8
2868         adoxq   %rax,%r10
2869         mulxq   16(%rsi),%r9,%rax
2870         adcxq   %r10,%r9
2871         adoxq   %rax,%r11
2872 .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2873         adcxq   %r11,%r10
2874         adoxq   %rax,%r12
2875 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2876         adcxq   %r12,%r11
2877         adoxq   %rax,%r13
2878         mulxq   40(%rsi),%r12,%rax
2879         adcxq   %r13,%r12
2880         adoxq   %rax,%r14
2881         mulxq   48(%rsi),%r13,%rax
2882         adcxq   %r14,%r13
2883         adoxq   %r15,%rax
2884         mulxq   56(%rsi),%r14,%r15
2885         movq    8(%rsi),%rdx
2886         adcxq   %rax,%r14
2887         adoxq   %rbp,%r15
2888         adcq    64(%rdi),%r15
2889         movq    %r8,8(%rdi)
2890         movq    %r9,16(%rdi)
# sbb of a register with itself materializes the carry as 0/-1 in %rcx.
2891         sbbq    %rcx,%rcx
2892         xorq    %rbp,%rbp
2893
2894
2895         mulxq   16(%rsi),%r8,%rbx
2896         mulxq   24(%rsi),%r9,%rax
2897         adcxq   %r10,%r8
2898         adoxq   %rbx,%r9
2899         mulxq   32(%rsi),%r10,%rbx
2900         adcxq   %r11,%r9
2901         adoxq   %rax,%r10
2902 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2903         adcxq   %r12,%r10
2904         adoxq   %rbx,%r11
2905 .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2906         adcxq   %r13,%r11
2907         adoxq   %r14,%r12
2908 .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2909         movq    16(%rsi),%rdx
2910         adcxq   %rax,%r12
2911         adoxq   %rbx,%r13
2912         adcxq   %r15,%r13
2913         adoxq   %rbp,%r14
2914         adcxq   %rbp,%r14
2915
2916         movq    %r8,24(%rdi)
2917         movq    %r9,32(%rdi)
2918
2919         mulxq   24(%rsi),%r8,%rbx
2920         mulxq   32(%rsi),%r9,%rax
2921         adcxq   %r10,%r8
2922         adoxq   %rbx,%r9
2923         mulxq   40(%rsi),%r10,%rbx
2924         adcxq   %r11,%r9
2925         adoxq   %rax,%r10
2926 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2927         adcxq   %r12,%r10
2928         adoxq   %r13,%r11
2929 .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2930 .byte   0x3e
2931         movq    24(%rsi),%rdx
2932         adcxq   %rbx,%r11
2933         adoxq   %rax,%r12
2934         adcxq   %r14,%r12
2935         movq    %r8,40(%rdi)
2936         movq    %r9,48(%rdi)
2937         mulxq   32(%rsi),%r8,%rax
2938         adoxq   %rbp,%r13
2939         adcxq   %rbp,%r13
2940
2941         mulxq   40(%rsi),%r9,%rbx
2942         adcxq   %r10,%r8
2943         adoxq   %rax,%r9
2944         mulxq   48(%rsi),%r10,%rax
2945         adcxq   %r11,%r9
2946         adoxq   %r12,%r10
2947         mulxq   56(%rsi),%r11,%r12
2948         movq    32(%rsi),%rdx
2949         movq    40(%rsi),%r14
2950         adcxq   %rbx,%r10
2951         adoxq   %rax,%r11
2952         movq    48(%rsi),%r15
2953         adcxq   %r13,%r11
2954         adoxq   %rbp,%r12
2955         adcxq   %rbp,%r12
2956
2957         movq    %r8,56(%rdi)
2958         movq    %r9,64(%rdi)
2959
# Last cross products of the block: operands already cached in registers.
2960         mulxq   %r14,%r9,%rax
2961         movq    56(%rsi),%r8
2962         adcxq   %r10,%r9
2963         mulxq   %r15,%r10,%rbx
2964         adoxq   %rax,%r10
2965         adcxq   %r11,%r10
2966         mulxq   %r8,%r11,%rax
2967         movq    %r14,%rdx
2968         adoxq   %rbx,%r11
2969         adcxq   %r12,%r11
2970
2971         adcxq   %rbp,%rax
2972
2973         mulxq   %r15,%r14,%rbx
2974         mulxq   %r8,%r12,%r13
2975         movq    %r15,%rdx
2976         leaq    64(%rsi),%rsi
2977         adcxq   %r14,%r11
2978         adoxq   %rbx,%r12
2979         adcxq   %rax,%r12
2980         adoxq   %rbp,%r13
2981
# 0x67,0x67: address-size prefixes used as alignment padding.
2982 .byte   0x67,0x67
2983         mulxq   %r8,%r8,%r14
2984         adcxq   %r8,%r13
2985         adcxq   %rbp,%r14
2986
# Reached the end of the input? (8+8(%rsp) holds ap+num.)
2987         cmpq    8+8(%rsp),%rsi
2988         je      .Lsqrx8x_outer_break
2989
2990         negq    %rcx
2991         movq    $-8,%rcx
2992         movq    %rbp,%r15
2993         movq    64(%rdi),%r8
2994         adcxq   72(%rdi),%r9
2995         adcxq   80(%rdi),%r10
2996         adcxq   88(%rdi),%r11
2997         adcq    96(%rdi),%r12
2998         adcq    104(%rdi),%r13
2999         adcq    112(%rdi),%r14
3000         adcq    120(%rdi),%r15
3001         leaq    (%rsi),%rbp
3002         leaq    128(%rdi),%rdi
3003         sbbq    %rax,%rax
3004
3005         movq    -64(%rsi),%rdx
# 16+8(%rsp) = saved top-most carry (0/-1); 24+8(%rsp) = saved tp cursor.
3006         movq    %rax,16+8(%rsp)
3007         movq    %rdi,24+8(%rsp)
3008
3009
3010         xorl    %eax,%eax
3011         jmp     .Lsqrx8x_loop
3012
3013 .align  32
# Inner loop: multiply the current source word (%rdx) by eight words of
# the remaining operand (%rbp) and accumulate into tp; %rcx counts -8..0.
3014 .Lsqrx8x_loop:
3015         movq    %r8,%rbx
3016         mulxq   0(%rbp),%rax,%r8
3017         adcxq   %rax,%rbx
3018         adoxq   %r9,%r8
3019
3020         mulxq   8(%rbp),%rax,%r9
3021         adcxq   %rax,%r8
3022         adoxq   %r10,%r9
3023
3024         mulxq   16(%rbp),%rax,%r10
3025         adcxq   %rax,%r9
3026         adoxq   %r11,%r10
3027
3028         mulxq   24(%rbp),%rax,%r11
3029         adcxq   %rax,%r10
3030         adoxq   %r12,%r11
3031
3032 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3033         adcxq   %rax,%r11
3034         adoxq   %r13,%r12
3035
3036         mulxq   40(%rbp),%rax,%r13
3037         adcxq   %rax,%r12
3038         adoxq   %r14,%r13
3039
3040         mulxq   48(%rbp),%rax,%r14
3041         movq    %rbx,(%rdi,%rcx,8)
3042         movl    $0,%ebx
3043         adcxq   %rax,%r13
3044         adoxq   %r15,%r14
3045
3046 .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3047         movq    8(%rsi,%rcx,8),%rdx
3048         adcxq   %rax,%r14
3049         adoxq   %rbx,%r15
3050         adcxq   %rbx,%r15
3051
3052 .byte   0x67
3053         incq    %rcx
3054         jnz     .Lsqrx8x_loop
3055
3056         leaq    64(%rbp),%rbp
3057         movq    $-8,%rcx
3058         cmpq    8+8(%rsp),%rbp
3059         je      .Lsqrx8x_break
3060
# Re-inject the saved top carry (16+8(%rsp)) and fold the previously
# stored tp words into the accumulators before the next pass.
3061         subq    16+8(%rsp),%rbx
3062 .byte   0x66
3063         movq    -64(%rsi),%rdx
3064         adcxq   0(%rdi),%r8
3065         adcxq   8(%rdi),%r9
3066         adcq    16(%rdi),%r10
3067         adcq    24(%rdi),%r11
3068         adcq    32(%rdi),%r12
3069         adcq    40(%rdi),%r13
3070         adcq    48(%rdi),%r14
3071         adcq    56(%rdi),%r15
3072         leaq    64(%rdi),%rdi
3073 .byte   0x67
3074         sbbq    %rax,%rax
3075         xorl    %ebx,%ebx
3076         movq    %rax,16+8(%rsp)
3077         jmp     .Lsqrx8x_loop
3078
3079 .align  32
3080 .Lsqrx8x_break:
3081         subq    16+8(%rsp),%r8
3082         movq    24+8(%rsp),%rcx
3083         movq    0(%rsi),%rdx
3084         xorl    %ebp,%ebp
3085         movq    %r8,0(%rdi)
3086         cmpq    %rcx,%rdi
3087         je      .Lsqrx8x_outer_loop
3088
# Flush the register accumulators to tp and reload the next block's
# partial sums from the saved cursor %rcx.
3089         movq    %r9,8(%rdi)
3090         movq    8(%rcx),%r9
3091         movq    %r10,16(%rdi)
3092         movq    16(%rcx),%r10
3093         movq    %r11,24(%rdi)
3094         movq    24(%rcx),%r11
3095         movq    %r12,32(%rdi)
3096         movq    32(%rcx),%r12
3097         movq    %r13,40(%rdi)
3098         movq    40(%rcx),%r13
3099         movq    %r14,48(%rdi)
3100         movq    48(%rcx),%r14
3101         movq    %r15,56(%rdi)
3102         movq    56(%rcx),%r15
3103         movq    %rcx,%rdi
3104         jmp     .Lsqrx8x_outer_loop
3105
3106 .align  32
# All cross products done: store the top words, then double the
# off-diagonal sum and add the squared diagonal terms a[i]^2.
3107 .Lsqrx8x_outer_break:
3108         movq    %r9,72(%rdi)
# movq %xmm3,%rcx (recovers -num parked by the caller).
3109 .byte   102,72,15,126,217
3110         movq    %r10,80(%rdi)
3111         movq    %r11,88(%rdi)
3112         movq    %r12,96(%rdi)
3113         movq    %r13,104(%rdi)
3114         movq    %r14,112(%rdi)
3115         leaq    48+8(%rsp),%rdi
3116         movq    (%rsi,%rcx,1),%rdx
3117
3118         movq    8(%rdi),%r11
3119         xorq    %r10,%r10
3120         movq    0+8(%rsp),%r9
# adox reg,reg doubles a word while threading the shift carry through OF.
3121         adoxq   %r11,%r11
3122         movq    16(%rdi),%r12
3123         movq    24(%rdi),%r13
3124
3125
3126 .align  32
# shift-and-add: tp = 2*tp + a[i]^2.  mulx %rdx,%rax,%rbx squares the
# current source word; the adox pairs perform the doubling.
3127 .Lsqrx4x_shift_n_add:
3128         mulxq   %rdx,%rax,%rbx
3129         adoxq   %r12,%r12
3130         adcxq   %r10,%rax
# Hand-encoded loads: mov 8(%rsi,%rcx,1),%rdx and mov 32(%rdi),%r10.
3131 .byte   0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3132 .byte   0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3133         adoxq   %r13,%r13
3134         adcxq   %r11,%rbx
3135         movq    40(%rdi),%r11
3136         movq    %rax,0(%rdi)
3137         movq    %rbx,8(%rdi)
3138
3139         mulxq   %rdx,%rax,%rbx
3140         adoxq   %r10,%r10
3141         adcxq   %r12,%rax
3142         movq    16(%rsi,%rcx,1),%rdx
3143         movq    48(%rdi),%r12
3144         adoxq   %r11,%r11
3145         adcxq   %r13,%rbx
3146         movq    56(%rdi),%r13
3147         movq    %rax,16(%rdi)
3148         movq    %rbx,24(%rdi)
3149
3150         mulxq   %rdx,%rax,%rbx
3151         adoxq   %r12,%r12
3152         adcxq   %r10,%rax
3153         movq    24(%rsi,%rcx,1),%rdx
3154         leaq    32(%rcx),%rcx
3155         movq    64(%rdi),%r10
3156         adoxq   %r13,%r13
3157         adcxq   %r11,%rbx
3158         movq    72(%rdi),%r11
3159         movq    %rax,32(%rdi)
3160         movq    %rbx,40(%rdi)
3161
3162         mulxq   %rdx,%rax,%rbx
3163         adoxq   %r10,%r10
3164         adcxq   %r12,%rax
# jrcxz: exits when the (negative) word counter reaches zero; it does not
# disturb CF/OF, which carry the shift/add chains across iterations.
3165         jrcxz   .Lsqrx4x_shift_n_add_break
3166 .byte   0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3167         adoxq   %r11,%r11
3168         adcxq   %r13,%rbx
3169         movq    80(%rdi),%r12
3170         movq    88(%rdi),%r13
3171         movq    %rax,48(%rdi)
3172         movq    %rbx,56(%rdi)
3173         leaq    64(%rdi),%rdi
3174         nop
3175         jmp     .Lsqrx4x_shift_n_add
3176
3177 .align  32
3178 .Lsqrx4x_shift_n_add_break:
3179         adcxq   %r13,%rbx
3180         movq    %rax,48(%rdi)
3181         movq    %rbx,56(%rdi)
3182         leaq    64(%rdi),%rdi
# movq %xmm5,%rbp -- restore the modulus pointer for the reduction.
3183 .byte   102,72,15,126,213
# Montgomery reduction of the double-width square in tp:
# %rbx = n0 (from 32+8(%rsp)), %rdx = current tp word; each pass folds
# eight words against the modulus at %rbp.
3184 __bn_sqrx8x_reduction:
3185         xorl    %eax,%eax
3186         movq    32+8(%rsp),%rbx
3187         movq    48+8(%rsp),%rdx
3188         leaq    -64(%rbp,%r9,1),%rcx
3189
3190         movq    %rcx,0+8(%rsp)
3191         movq    %rdi,8+8(%rsp)
3192
3193         leaq    48+8(%rsp),%rdi
3194         jmp     .Lsqrx8x_reduction_loop
3195
3196 .align  32
3197 .Lsqrx8x_reduction_loop:
3198         movq    8(%rdi),%r9
3199         movq    16(%rdi),%r10
3200         movq    24(%rdi),%r11
3201         movq    32(%rdi),%r12
# %rdx = m = tp[0] * n0 mod 2^64 -- the Montgomery quotient digit.
3202         movq    %rdx,%r8
3203         imulq   %rbx,%rdx
3204         movq    40(%rdi),%r13
3205         movq    48(%rdi),%r14
3206         movq    56(%rdi),%r15
3207         movq    %rax,24+8(%rsp)
3208
3209         leaq    64(%rdi),%rdi
3210         xorq    %rsi,%rsi
3211         movq    $-8,%rcx
3212         jmp     .Lsqrx8x_reduce
3213
3214 .align  32
3215 .Lsqrx8x_reduce:
3216         movq    %r8,%rbx
3217         mulxq   0(%rbp),%rax,%r8
3218         adcxq   %rbx,%rax
3219         adoxq   %r9,%r8
3220
3221         mulxq   8(%rbp),%rbx,%r9
3222         adcxq   %rbx,%r8
3223         adoxq   %r10,%r9
3224
3225         mulxq   16(%rbp),%rbx,%r10
3226         adcxq   %rbx,%r9
3227         adoxq   %r11,%r10
3228
3229         mulxq   24(%rbp),%rbx,%r11
3230         adcxq   %rbx,%r10
3231         adoxq   %r12,%r11
3232
3233 .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3234         movq    %rdx,%rax
3235         movq    %r8,%rdx
3236         adcxq   %rbx,%r11
3237         adoxq   %r13,%r12
3238
# Compute the NEXT quotient digit from the new tp[0] (%r8) and n0 at
# 32+8(%rsp); save the current digit at 64+48+8(%rsp,%rcx,8) for the
# tail pass below.
3239         mulxq   32+8(%rsp),%rbx,%rdx
3240         movq    %rax,%rdx
3241         movq    %rax,64+48+8(%rsp,%rcx,8)
3242
3243         mulxq   40(%rbp),%rax,%r13
3244         adcxq   %rax,%r12
3245         adoxq   %r14,%r13
3246
3247         mulxq   48(%rbp),%rax,%r14
3248         adcxq   %rax,%r13
3249         adoxq   %r15,%r14
3250
3251         mulxq   56(%rbp),%rax,%r15
3252         movq    %rbx,%rdx
3253         adcxq   %rax,%r14
3254         adoxq   %rsi,%r15
3255         adcxq   %rsi,%r15
3256
3257 .byte   0x67,0x67,0x67
3258         incq    %rcx
3259         jnz     .Lsqrx8x_reduce
3260
3261         movq    %rsi,%rax
3262         cmpq    0+8(%rsp),%rbp
3263         jae     .Lsqrx8x_no_tail
3264
# Tail: propagate the reduction across the remaining high words of tp.
3265         movq    48+8(%rsp),%rdx
3266         addq    0(%rdi),%r8
3267         leaq    64(%rbp),%rbp
3268         movq    $-8,%rcx
3269         adcxq   8(%rdi),%r9
3270         adcxq   16(%rdi),%r10
3271         adcq    24(%rdi),%r11
3272         adcq    32(%rdi),%r12
3273         adcq    40(%rdi),%r13
3274         adcq    48(%rdi),%r14
3275         adcq    56(%rdi),%r15
3276         leaq    64(%rdi),%rdi
3277         sbbq    %rax,%rax
3278
3279         xorq    %rsi,%rsi
3280         movq    %rax,16+8(%rsp)
3281         jmp     .Lsqrx8x_tail
3282
3283 .align  32
3284 .Lsqrx8x_tail:
3285         movq    %r8,%rbx
3286         mulxq   0(%rbp),%rax,%r8
3287         adcxq   %rax,%rbx
3288         adoxq   %r9,%r8
3289
3290         mulxq   8(%rbp),%rax,%r9
3291         adcxq   %rax,%r8
3292         adoxq   %r10,%r9
3293
3294         mulxq   16(%rbp),%rax,%r10
3295         adcxq   %rax,%r9
3296         adoxq   %r11,%r10
3297
3298         mulxq   24(%rbp),%rax,%r11
3299         adcxq   %rax,%r10
3300         adoxq   %r12,%r11
3301
3302 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3303         adcxq   %rax,%r11
3304         adoxq   %r13,%r12
3305
3306         mulxq   40(%rbp),%rax,%r13
3307         adcxq   %rax,%r12
3308         adoxq   %r14,%r13
3309
3310         mulxq   48(%rbp),%rax,%r14
3311         adcxq   %rax,%r13
3312         adoxq   %r15,%r14
3313
# Next saved quotient digit comes from the stash written at .Lsqrx8x_reduce.
3314         mulxq   56(%rbp),%rax,%r15
3315         movq    72+48+8(%rsp,%rcx,8),%rdx
3316         adcxq   %rax,%r14
3317         adoxq   %rsi,%r15
3318         movq    %rbx,(%rdi,%rcx,8)
3319         movq    %r8,%rbx
3320         adcxq   %rsi,%r15
3321
3322         incq    %rcx
3323         jnz     .Lsqrx8x_tail
3324
3325         cmpq    0+8(%rsp),%rbp
3326         jae     .Lsqrx8x_tail_done
3327
3328         subq    16+8(%rsp),%rsi
3329         movq    48+8(%rsp),%rdx
3330         leaq    64(%rbp),%rbp
3331         adcq    0(%rdi),%r8
3332         adcq    8(%rdi),%r9
3333         adcq    16(%rdi),%r10
3334         adcq    24(%rdi),%r11
3335         adcq    32(%rdi),%r12
3336         adcq    40(%rdi),%r13
3337         adcq    48(%rdi),%r14
3338         adcq    56(%rdi),%r15
3339         leaq    64(%rdi),%rdi
3340         sbbq    %rax,%rax
3341         subq    $8,%rcx
3342
3343         xorq    %rsi,%rsi
3344         movq    %rax,16+8(%rsp)
3345         jmp     .Lsqrx8x_tail
3346
3347 .align  32
3348 .Lsqrx8x_tail_done:
# Fold in the carry saved at 24+8(%rsp) and ripple it to the top.
3349         addq    24+8(%rsp),%r8
3350         adcq    $0,%r9
3351         adcq    $0,%r10
3352         adcq    $0,%r11
3353         adcq    $0,%r12
3354         adcq    $0,%r13
3355         adcq    $0,%r14
3356         adcq    $0,%r15
3357
3358
3359         movq    %rsi,%rax
3360
3361         subq    16+8(%rsp),%rsi
3362 .Lsqrx8x_no_tail:
3363         adcq    0(%rdi),%r8
# movq %xmm3,%rcx and movq %xmm5,%rbp -- restore parked -num and np.
3364 .byte   102,72,15,126,217
3365         adcq    8(%rdi),%r9
3366         movq    56(%rbp),%rsi
3367 .byte   102,72,15,126,213
3368         adcq    16(%rdi),%r10
3369         adcq    24(%rdi),%r11
3370         adcq    32(%rdi),%r12
3371         adcq    40(%rdi),%r13
3372         adcq    48(%rdi),%r14
3373         adcq    56(%rdi),%r15
# %rax accumulates the final top carry for the caller's conditional
# subtraction (__bn_postx4x_internal negates it into a mask).
3374         adcq    %rax,%rax
3375
3376         movq    32+8(%rsp),%rbx
3377         movq    64(%rdi,%rcx,1),%rdx
3378
3379         movq    %r8,0(%rdi)
3380         leaq    64(%rdi),%r8
3381         movq    %r9,8(%rdi)
3382         movq    %r10,16(%rdi)
3383         movq    %r11,24(%rdi)
3384         movq    %r12,32(%rdi)
3385         movq    %r13,40(%rdi)
3386         movq    %r14,48(%rdi)
3387         movq    %r15,56(%rdi)
3388
3389         leaq    64(%rdi,%rcx,1),%rdi
3390         cmpq    8+8(%rsp),%r8
3391         jb      .Lsqrx8x_reduction_loop
# rep ret
3392         .byte   0xf3,0xc3
3393 .size   bn_sqrx8x_internal,.-bn_sqrx8x_internal
# ----------------------------------------------------------------------
# __bn_postx4x_internal: final conditional subtraction after a MULX/ADX
# Montgomery squaring.  Branchlessly (constant-time) computes
# result - (borrow ? 0 : modulus): %rax is negated into an all-ones/zero
# mask, ANDN masks each modulus word, and the adc chain performs the
# subtract via two's complement.  .Lsqrx4x_sub_entry is also the shared
# tail of mulx4x_internal (see the jmp at the end of that function above).
# In: %rbp = modulus, %rdi = reduced result, %rcx = -num (bytes),
#     %rax = top-carry flag from the reduction; xmm2/xmm6 hold the output
#     and input pointers (recovered by the movq transfers below).
3394 .align  32
3395 __bn_postx4x_internal:
3396         movq    0(%rbp),%r12
3397         movq    %rcx,%r10
3398         movq    %rcx,%r9
# %rax: 0/1 carry -> 0/-1 mask after neg; %rcx: byte count -> count of
# 4-word groups (sar by 5 = divide by 32, sign-preserving).
3399         negq    %rax
3400         sarq    $3+2,%rcx
3401
# movq %xmm2,%rdx (output ptr) / movq %xmm6,%rsi -- pointers parked by
# the caller in xmm registers.
3402 .byte   102,72,15,126,202
3403 .byte   102,72,15,126,206
# dec sets CF-independent flags only; combined with andn below it folds
# the "subtract modulus or subtract 0" choice into the mask without a
# data-dependent branch.
3404         decq    %r12
3405         movq    8(%rbp),%r13
3406         xorq    %r8,%r8
3407         movq    16(%rbp),%r14
3408         movq    24(%rbp),%r15
3409         jmp     .Lsqrx4x_sub_entry
3410
3411 .align  16
3412 .Lsqrx4x_sub:
3413         movq    0(%rbp),%r12
3414         movq    8(%rbp),%r13
3415         movq    16(%rbp),%r14
3416         movq    24(%rbp),%r15
3417 .Lsqrx4x_sub_entry:
# andn: rN = ~rN & mask, i.e. the complemented modulus word when the
# mask is all-ones, or all-ones when the mask is zero (no subtraction).
3418         andnq   %rax,%r12,%r12
3419         leaq    32(%rbp),%rbp
3420         andnq   %rax,%r13,%r13
3421         andnq   %rax,%r14,%r14
3422         andnq   %rax,%r15,%r15
3423
# neg %r8 reloads the borrow from the previous group into CF; the adc
# chain then adds result + ~modulus + carry = result - modulus.
3424         negq    %r8
3425         adcq    0(%rdi),%r12
3426         adcq    8(%rdi),%r13
3427         adcq    16(%rdi),%r14
3428         adcq    24(%rdi),%r15
3429         movq    %r12,0(%rdx)
3430         leaq    32(%rdi),%rdi
3431         movq    %r13,8(%rdx)
# Save the borrow as 0/-1 in %r8 for the next 4-word group.
3432         sbbq    %r8,%r8
3433         movq    %r14,16(%rdx)
3434         movq    %r15,24(%rdx)
3435         leaq    32(%rdx),%rdx
3436
3437         incq    %rcx
3438         jnz     .Lsqrx4x_sub
3439
# Restore %r9 to the positive byte count for the caller.
3440         negq    %r9
3441
# rep ret
3442         .byte   0xf3,0xc3
3443 .size   __bn_postx4x_internal,.-__bn_postx4x_internal
# ----------------------------------------------------------------------
# int bn_get_bits5(const BN_ULONG *a=%rdi, int bitpos=%esi)
# Returns (in %eax) the 5-bit window of the bit string |a| starting at
# bit index |bitpos|.  Implemented branchlessly with cmov so the memory
# access pattern does not depend on the secret bit offset: it always
# loads one 16-bit word, from a+2*(bitpos>>4) normally, or from one byte
# further (with the shift reduced by 8) when the window would straddle
# the 16-bit boundary (bit offset within word > 11).
3444 .globl  bn_get_bits5
3445 .type   bn_get_bits5,@function
3446 .align  16
3447 bn_get_bits5:
# %r10 = a, %r11 = a+1: the two candidate base addresses.
3448         leaq    0(%rdi),%r10
3449         leaq    1(%rdi),%r11
# %ecx = bitpos & 15 (shift within the 16-bit word), %esi = bitpos >> 4
# (16-bit word index).
3450         movl    %esi,%ecx
3451         shrl    $4,%esi
3452         andl    $15,%ecx
# Precompute the alternate shift (ecx-8) for the straddling case.
3453         leal    -8(%rcx),%eax
# If the in-word offset exceeds 11, bits 12..15 can't hold all 5 bits:
# switch to the byte-shifted address and the reduced shift count.
3454         cmpl    $11,%ecx
3455         cmovaq  %r11,%r10
3456         cmoval  %eax,%ecx
# Load 16 bits, shift the window down, mask to 5 bits.
3457         movzwl  (%r10,%rsi,2),%eax
3458         shrl    %cl,%eax
3459         andl    $31,%eax
# rep ret
3460         .byte   0xf3,0xc3
3461 .size   bn_get_bits5,.-bn_get_bits5
3462
/*
 * bn_scatter5 — store a number into column `idx` of the power table.
 *   rdi = source words, esi = word count, rdx = table, rcx = idx.
 *
 * Word j is written to tbl + idx*8 + j*256: the table interleaves 32
 * columns of 8 bytes each (256-byte row stride), the layout consumed
 * by bn_gather5 below.
 */
.globl  bn_scatter5
.type   bn_scatter5,@function
.align  16
bn_scatter5:
        cmpl    $0,%esi                 /* zero-length input: store nothing */
        jz      .Lscatter_epilogue
        leaq    (%rdx,%rcx,8),%rdx      /* column base = &tbl[idx] */
.Lscatter:
        movq    (%rdi),%rax
        leaq    8(%rdi),%rdi
        movq    %rax,(%rdx)
        leaq    256(%rdx),%rdx          /* advance one 256-byte row */
        subl    $1,%esi
        jnz     .Lscatter
.Lscatter_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_scatter5,.-bn_scatter5
3480
/*
 * bn_gather5 — cache-timing-safe gather of column `idx` from the
 * 32-column power table written by bn_scatter5.
 *   rdi = output words, esi = word count, rdx = table, ecx = idx.
 *
 * Rather than indexing the table directly (which would leak idx
 * through the data cache), a 256-byte mask table is built on the
 * stack: 16 xmm slots, where the slot pair matching idx is all-ones
 * and every other lane is zero (pcmpeqd against idx broadcast in
 * xmm5, counters stepped by .Linc). Each output word then reads ALL
 * 32 columns of a row, ANDs them with their masks and ORs them down,
 * so the memory access pattern is independent of idx.
 */
.globl  bn_gather5
.type   bn_gather5,@function
.align  32
bn_gather5:
.LSEH_begin_bn_gather5:

/* Hand-encoded prologue (note the .LSEH_begin label above — presumably
 * kept as fixed-size bytes for the Win64 unwind data emitted by the
 * perlasm generator; confirm against x86_64-mont5.pl): */
.byte   0x4c,0x8d,0x14,0x24            /* lea (%rsp),%r10 — save old rsp */
.byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00 /* sub $0x108,%rsp — mask buffer */
        leaq    .Linc(%rip),%rax
        andq    $-16,%rsp               /* align for movdqa stores */

        movd    %ecx,%xmm5
        movdqa  0(%rax),%xmm0           /* counter pair {0,0,1,1} */
        movdqa  16(%rax),%xmm1          /* step {2,2,2,2} */
        leaq    128(%rdx),%r11          /* biased table pointer */
        leaq    128(%rsp),%rax          /* biased mask-buffer pointer */

        pshufd  $0,%xmm5,%xmm5          /* broadcast idx to all 4 dwords */
        movdqa  %xmm1,%xmm4
        movdqa  %xmm1,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0             /* mask slot 0: lanes == idx? */
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1             /* each pcmpeqd tests the next
                                           counter pair against idx */
        movdqa  %xmm0,-128(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-112(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-96(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-80(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,-64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-48(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-16(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,0(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,16(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,48(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,80(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,96(%rax)
        movdqa  %xmm4,%xmm2
        movdqa  %xmm3,112(%rax)         /* 16th and last mask slot */
        jmp     .Lgather

.align  32
.Lgather:
        /* One output word per iteration: touch all 16 mask slots and
         * all 256 bytes of the table row, accumulating into xmm4/xmm5. */
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        movdqa  -128(%r11),%xmm0
        movdqa  -112(%r11),%xmm1
        movdqa  -96(%r11),%xmm2
        pand    -128(%rax),%xmm0        /* keep only the selected column */
        movdqa  -80(%r11),%xmm3
        pand    -112(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -80(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  -64(%r11),%xmm0
        movdqa  -48(%r11),%xmm1
        movdqa  -32(%r11),%xmm2
        pand    -64(%rax),%xmm0
        movdqa  -16(%r11),%xmm3
        pand    -48(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -16(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  0(%r11),%xmm0
        movdqa  16(%r11),%xmm1
        movdqa  32(%r11),%xmm2
        pand    0(%rax),%xmm0
        movdqa  48(%r11),%xmm3
        pand    16(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    48(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  64(%r11),%xmm0
        movdqa  80(%r11),%xmm1
        movdqa  96(%r11),%xmm2
        pand    64(%rax),%xmm0
        movdqa  112(%r11),%xmm3
        pand    80(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    112(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        por     %xmm5,%xmm4             /* fold the two accumulators */
        leaq    256(%r11),%r11          /* next table row */
        pshufd  $0x4e,%xmm4,%xmm0       /* swap qword halves ... */
        por     %xmm4,%xmm0             /* ... so low qword holds the word */
        movq    %xmm0,(%rdi)            /* emit one 64-bit output word */
        leaq    8(%rdi),%rdi
        subl    $1,%esi
        jnz     .Lgather

        leaq    (%r10),%rsp             /* restore rsp saved in prologue */
        .byte   0xf3,0xc3               /* rep ret */
.LSEH_end_bn_gather5:
.size   bn_gather5,.-bn_gather5
/*
 * .Linc — increments for the pcmpeqd index-ladder used by bn_gather5
 * (and the gather setup at the top of this file): the first quad seeds
 * the paired lane counters {0,0,1,1}, the second is the per-step
 * increment {2,2,2,2}.
 */
.align  64
.Linc:
.long   0,0, 1,1
.long   2,2, 2,2
/* Attribution string: "Montgomery Multiplication with scatter/gather
 * for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0