# $FreeBSD$
.text

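# bn_mul_mont_gather5(rp, ap, bp, np, n0, num, power): Montgomery
# multiplication fused with a constant-time gather of the multiplicand
# from a table of 32 pre-computed powers at bp.  The selector "power"
# (the 7th argument, loaded from 8(%rsp) on entry) never appears in an
# address: pcmpeqd builds an all-ones mask for the matching table row
# and pand/por extract it.  A num that is a multiple of 8 is diverted
# to the 4x-unrolled path (.Lmul4x_enter).  Argument names here follow
# the upstream perlasm source, crypto/bn/asm/x86_64-mont5.pl; the stray
# .byte 0x2e/0x66/0x67 prefixes throughout are harmless segment- and
# size-override prefixes emitted for branch-prediction and
# instruction-alignment tuning.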
.globl  bn_mul_mont_gather5
.type   bn_mul_mont_gather5,@function
.align  64
bn_mul_mont_gather5:
        testl   $7,%r9d
        jnz     .Lmul_enter
        jmp     .Lmul4x_enter

.align  16
.Lmul_enter:
        movl    %r9d,%r9d
        movq    %rsp,%rax
        movd    8(%rsp),%xmm5
        leaq    .Linc(%rip),%r10
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        leaq    2(%r9),%r11
        negq    %r11
        leaq    -264(%rsp,%r11,8),%rsp
        andq    $-1024,%rsp

        movq    %rax,8(%rsp,%r9,8)
.Lmul_body:

        subq    %rsp,%rax
        andq    $-4096,%rax
.Lmul_page_walk:
        movq    (%rsp,%rax,1),%r11
        subq    $4096,%rax
.byte   0x2e
        jnc     .Lmul_page_walk

        leaq    128(%rdx),%r12
        movdqa  0(%r10),%xmm0
        movdqa  16(%r10),%xmm1
        leaq    24-112(%rsp,%r9,8),%r10
        andq    $-16,%r10

        pshufd  $0,%xmm5,%xmm5
        movdqa  %xmm1,%xmm4
        movdqa  %xmm1,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
.byte   0x67
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,112(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,128(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,144(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,160(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,176(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,192(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,208(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,224(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,240(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,256(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,272(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,288(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,304(%r10)

        paddd   %xmm2,%xmm3
.byte   0x67
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,320(%r10)

        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,336(%r10)
        pand    64(%r12),%xmm0

        pand    80(%r12),%xmm1
        pand    96(%r12),%xmm2
        movdqa  %xmm3,352(%r10)
        pand    112(%r12),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  -128(%r12),%xmm4
        movdqa  -112(%r12),%xmm5
        movdqa  -96(%r12),%xmm2
        pand    112(%r10),%xmm4
        movdqa  -80(%r12),%xmm3
        pand    128(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    144(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    160(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  -64(%r12),%xmm4
        movdqa  -48(%r12),%xmm5
        movdqa  -32(%r12),%xmm2
        pand    176(%r10),%xmm4
        movdqa  -16(%r12),%xmm3
        pand    192(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    208(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    224(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  0(%r12),%xmm4
        movdqa  16(%r12),%xmm5
        movdqa  32(%r12),%xmm2
        pand    240(%r10),%xmm4
        movdqa  48(%r12),%xmm3
        pand    256(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    272(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    288(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        por     %xmm1,%xmm0
        pshufd  $0x4e,%xmm0,%xmm1
        por     %xmm1,%xmm0
        leaq    256(%r12),%r12
.byte   102,72,15,126,195

        movq    (%r8),%r8
        movq    (%rsi),%rax

        xorq    %r14,%r14
        xorq    %r15,%r15

        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax

        imulq   %r10,%rbp
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .L1st_enter

.align  16
.L1st:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        movq    %r10,%r11
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.L1st_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        leaq    1(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .L1st

        addq    %rax,%r13
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r9,8)
        movq    %rdx,%r13
        movq    %r10,%r11

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)
        movq    %rdx,(%rsp,%r9,8)

        leaq    1(%r14),%r14
        jmp     .Louter
.align  16
.Louter:
        leaq    24+128(%rsp,%r9,8),%rdx
        andq    $-16,%rdx
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        movdqa  -128(%r12),%xmm0
        movdqa  -112(%r12),%xmm1
        movdqa  -96(%r12),%xmm2
        movdqa  -80(%r12),%xmm3
        pand    -128(%rdx),%xmm0
        pand    -112(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    -96(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    -80(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  -64(%r12),%xmm0
        movdqa  -48(%r12),%xmm1
        movdqa  -32(%r12),%xmm2
        movdqa  -16(%r12),%xmm3
        pand    -64(%rdx),%xmm0
        pand    -48(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    -32(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    -16(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  0(%r12),%xmm0
        movdqa  16(%r12),%xmm1
        movdqa  32(%r12),%xmm2
        movdqa  48(%r12),%xmm3
        pand    0(%rdx),%xmm0
        pand    16(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    32(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    48(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  64(%r12),%xmm0
        movdqa  80(%r12),%xmm1
        movdqa  96(%r12),%xmm2
        movdqa  112(%r12),%xmm3
        pand    64(%rdx),%xmm0
        pand    80(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    96(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    112(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        por     %xmm5,%xmm4
        pshufd  $0x4e,%xmm4,%xmm0
        por     %xmm4,%xmm0
        leaq    256(%r12),%r12

        movq    (%rsi),%rax
.byte   102,72,15,126,195

        xorq    %r15,%r15
        movq    %r8,%rbp
        movq    (%rsp),%r10

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        imulq   %r10,%rbp
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    8(%rsp),%r10
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .Linner_enter

.align  16
.Linner:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.Linner_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        leaq    1(%r15),%r15

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .Linner

        addq    %rax,%r13
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r9,8),%r10
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r9,8)
        movq    %rdx,%r13

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)
        movq    %rdx,(%rsp,%r9,8)

        leaq    1(%r14),%r14
        cmpq    %r9,%r14
        jb      .Louter

        xorq    %r14,%r14
        movq    (%rsp),%rax
        leaq    (%rsp),%rsi
        movq    %r9,%r15
        jmp     .Lsub
.align  16
.Lsub:  sbbq    (%rcx,%r14,8),%rax
        movq    %rax,(%rdi,%r14,8)
        movq    8(%rsi,%r14,8),%rax
        leaq    1(%r14),%r14
        decq    %r15
        jnz     .Lsub

        sbbq    $0,%rax
        xorq    %r14,%r14
        andq    %rax,%rsi
        notq    %rax
        movq    %rdi,%rcx
        andq    %rax,%rcx
        movq    %r9,%r15
        orq     %rcx,%rsi
.align  16
.Lcopy:
        movq    (%rsi,%r14,8),%rax
        movq    %r14,(%rsp,%r14,8)
        movq    %rax,(%rdi,%r14,8)
        leaq    1(%r14),%r14
        subq    $1,%r15
        jnz     .Lcopy

        movq    8(%rsp,%r9,8),%rsi
        movq    $1,%rax

        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lmul_epilogue:
        .byte   0xf3,0xc3
.size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
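
# bn_mul4x_mont_gather5: entry shim for the num%8 == 0 case.  It carves
# out a stack frame whose placement is adjusted against the output
# pointer (the distance to %rdi modulo 4096 is folded into the frame
# address, apparently to keep the temporary vector from aliasing rp in
# 4KB cache sets), probes the new stack one page at a time
# (.Lmul4x_page_walk) so a guard page cannot be skipped, and hands off
# to mul4x_internal.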
.type   bn_mul4x_mont_gather5,@function
.align  32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
.byte   0x67
        movq    %rsp,%rax
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

.byte   0x67
        shll    $3,%r9d
        leaq    (%r9,%r9,2),%r10
        negq    %r9

        leaq    -320(%rsp,%r9,2),%r11
        subq    %rdi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lmul4xsp_alt
        subq    %r11,%rsp
        leaq    -320(%rsp,%r9,2),%rsp
        jmp     .Lmul4xsp_done

.align  32
.Lmul4xsp_alt:
        leaq    4096-320(,%r9,2),%r10
        leaq    -320(%rsp,%r9,2),%rsp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11
        subq    %r11,%rsp
.Lmul4xsp_done:
        andq    $-64,%rsp
        movq    %rax,%r11
        subq    %rsp,%r11
        andq    $-4096,%r11
.Lmul4x_page_walk:
        movq    (%rsp,%r11,1),%r10
        subq    $4096,%r11
.byte   0x2e
        jnc     .Lmul4x_page_walk

        negq    %r9

        movq    %rax,40(%rsp)
.Lmul4x_body:

        call    mul4x_internal

        movq    40(%rsp),%rsi
        movq    $1,%rax

        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lmul4x_epilogue:
        .byte   0xf3,0xc3
.size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

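# mul4x_internal: the 4x-unrolled Montgomery multiplication body shared
# by bn_mul4x_mont_gather5 and bn_power5.  The first block below
# replays the same .Linc/pcmpeqd mask construction as above to gather
# b[0] in constant time; each .Louter4x iteration then gathers the
# next b[i] while folding four limbs of a*b[i] plus the reduction term
# into the running result.  .byte 102,72,15,126,195 is a hand-encoded
# "movq %xmm0,%rbx" moving the gathered limb into the multiplier.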
.type   mul4x_internal,@function
.align  32
mul4x_internal:
        shlq    $5,%r9
        movd    8(%rax),%xmm5
        leaq    .Linc(%rip),%rax
        leaq    128(%rdx,%r9,1),%r13
        shrq    $5,%r9
        movdqa  0(%rax),%xmm0
        movdqa  16(%rax),%xmm1
        leaq    88-112(%rsp,%r9,1),%r10
        leaq    128(%rdx),%r12

        pshufd  $0,%xmm5,%xmm5
        movdqa  %xmm1,%xmm4
.byte   0x67,0x67
        movdqa  %xmm1,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
.byte   0x67
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,112(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,128(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,144(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,160(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,176(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,192(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,208(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,224(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,240(%r10)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,256(%r10)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,272(%r10)
        movdqa  %xmm4,%xmm2

        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,288(%r10)
        movdqa  %xmm4,%xmm3
        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,304(%r10)

        paddd   %xmm2,%xmm3
.byte   0x67
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,320(%r10)

        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,336(%r10)
        pand    64(%r12),%xmm0

        pand    80(%r12),%xmm1
        pand    96(%r12),%xmm2
        movdqa  %xmm3,352(%r10)
        pand    112(%r12),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  -128(%r12),%xmm4
        movdqa  -112(%r12),%xmm5
        movdqa  -96(%r12),%xmm2
        pand    112(%r10),%xmm4
        movdqa  -80(%r12),%xmm3
        pand    128(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    144(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    160(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  -64(%r12),%xmm4
        movdqa  -48(%r12),%xmm5
        movdqa  -32(%r12),%xmm2
        pand    176(%r10),%xmm4
        movdqa  -16(%r12),%xmm3
        pand    192(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    208(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    224(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        movdqa  0(%r12),%xmm4
        movdqa  16(%r12),%xmm5
        movdqa  32(%r12),%xmm2
        pand    240(%r10),%xmm4
        movdqa  48(%r12),%xmm3
        pand    256(%r10),%xmm5
        por     %xmm4,%xmm0
        pand    272(%r10),%xmm2
        por     %xmm5,%xmm1
        pand    288(%r10),%xmm3
        por     %xmm2,%xmm0
        por     %xmm3,%xmm1
        por     %xmm1,%xmm0
        pshufd  $0x4e,%xmm0,%xmm1
        por     %xmm1,%xmm0
        leaq    256(%r12),%r12
.byte   102,72,15,126,195

        movq    %r13,16+8(%rsp)
        movq    %rdi,56+8(%rsp)

        movq    (%r8),%r8
        movq    (%rsi),%rax
        leaq    (%rsi,%r9,1),%rsi
        negq    %r9

        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax

        imulq   %r10,%rbp
        leaq    64+8(%rsp),%r14
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi,%r9,1),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi,%r9,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    32(%r9),%r15
        leaq    32(%rcx),%rcx
        adcq    $0,%rdx
        movq    %rdi,(%r14)
        movq    %rdx,%r13
        jmp     .L1st4x

.align  32
.L1st4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx),%rax
        leaq    32(%r14),%r14
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%r14)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    0(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    32(%rcx),%rcx
        adcq    $0,%rdx
        movq    %rdi,(%r14)
        movq    %rdx,%r13

        addq    $32,%r15
        jnz     .L1st4x

        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx),%rax
        leaq    32(%r14),%r14
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r9,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%r14)
        movq    %rdx,%r13

        leaq    (%rcx,%r9,1),%rcx

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        movq    %r13,-8(%r14)

        jmp     .Louter4x

.align  32
.Louter4x:
        leaq    16+128(%r14),%rdx
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        movdqa  -128(%r12),%xmm0
        movdqa  -112(%r12),%xmm1
        movdqa  -96(%r12),%xmm2
        movdqa  -80(%r12),%xmm3
        pand    -128(%rdx),%xmm0
        pand    -112(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    -96(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    -80(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  -64(%r12),%xmm0
        movdqa  -48(%r12),%xmm1
        movdqa  -32(%r12),%xmm2
        movdqa  -16(%r12),%xmm3
        pand    -64(%rdx),%xmm0
        pand    -48(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    -32(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    -16(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  0(%r12),%xmm0
        movdqa  16(%r12),%xmm1
        movdqa  32(%r12),%xmm2
        movdqa  48(%r12),%xmm3
        pand    0(%rdx),%xmm0
        pand    16(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    32(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    48(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  64(%r12),%xmm0
        movdqa  80(%r12),%xmm1
        movdqa  96(%r12),%xmm2
        movdqa  112(%r12),%xmm3
        pand    64(%rdx),%xmm0
        pand    80(%rdx),%xmm1
        por     %xmm0,%xmm4
        pand    96(%rdx),%xmm2
        por     %xmm1,%xmm5
        pand    112(%rdx),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        por     %xmm5,%xmm4
        pshufd  $0x4e,%xmm4,%xmm0
        por     %xmm4,%xmm0
        leaq    256(%r12),%r12
.byte   102,72,15,126,195

        movq    (%r14,%r9,1),%r10
        movq    %r8,%rbp
        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        imulq   %r10,%rbp
        movq    %rdx,%r11
        movq    %rdi,(%r14)

        leaq    (%r14,%r9,1),%r14

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi,%r9,1),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        addq    8(%r14),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi,%r9,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    32(%r9),%r15
        leaq    32(%rcx),%rcx
        adcq    $0,%rdx
        movq    %rdx,%r13
        jmp     .Linner4x

.align  32
.Linner4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx),%rax
        adcq    $0,%rdx
        addq    16(%r14),%r10
        leaq    32(%r14),%r14
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-32(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx),%rax
        adcq    $0,%rdx
        addq    -8(%r14),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %r13,-24(%r14)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    0(%rcx),%rax
        adcq    $0,%rdx
        addq    (%r14),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-16(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        addq    8(%r14),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi,%r15,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    32(%rcx),%rcx
        adcq    $0,%rdx
        movq    %r13,-8(%r14)
        movq    %rdx,%r13

        addq    $32,%r15
        jnz     .Linner4x

        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx),%rax
        adcq    $0,%rdx
        addq    16(%r14),%r10
        leaq    32(%r14),%r14
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-32(%r14)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    %rbp,%rax
        movq    -8(%rcx),%rbp
        adcq    $0,%rdx
        addq    -8(%r14),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r9,1),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %r13,-24(%r14)
        movq    %rdx,%r13

        movq    %rdi,-16(%r14)
        leaq    (%rcx,%r9,1),%rcx

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        addq    (%r14),%r13
        adcq    $0,%rdi
        movq    %r13,-8(%r14)

        cmpq    16+8(%rsp),%r12
        jb      .Louter4x
        xorq    %rax,%rax
        subq    %r13,%rbp
        adcq    %r15,%r15
        orq     %r15,%rdi
        subq    %rdi,%rax
        leaq    (%r14,%r9,1),%rbx
        movq    (%rcx),%r12
        leaq    (%rcx),%rbp
        movq    %r9,%rcx
        sarq    $3+2,%rcx
        movq    56+8(%rsp),%rdi
        decq    %r12
        xorq    %r10,%r10
        movq    8(%rbp),%r13
        movq    16(%rbp),%r14
        movq    24(%rbp),%r15
        jmp     .Lsqr4x_sub_entry
.size   mul4x_internal,.-mul4x_internal
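
# bn_power5(rp, ap, table, np, n0, num, power): one window step of the
# constant-time modular exponentiation -- five Montgomery squarings
# (__bn_sqr8x_internal + __bn_post4x_internal each time) followed by a
# single Montgomery multiplication by the table entry selected by
# "power".  The .byte 102,72/73,15,110,... sequences are hand-encoded
# movq instructions parking the argument registers in %xmm1-%xmm4
# across the internal calls.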
.globl  bn_power5
.type   bn_power5,@function
.align  32
bn_power5:
        movq    %rsp,%rax
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        shll    $3,%r9d
        leal    (%r9,%r9,2),%r10d
        negq    %r9
        movq    (%r8),%r8

        leaq    -320(%rsp,%r9,2),%r11
        subq    %rdi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lpwr_sp_alt
        subq    %r11,%rsp
        leaq    -320(%rsp,%r9,2),%rsp
        jmp     .Lpwr_sp_done

.align  32
.Lpwr_sp_alt:
        leaq    4096-320(,%r9,2),%r10
        leaq    -320(%rsp,%r9,2),%rsp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11
        subq    %r11,%rsp
.Lpwr_sp_done:
        andq    $-64,%rsp
        movq    %rax,%r11
        subq    %rsp,%r11
        andq    $-4096,%r11
.Lpwr_page_walk:
        movq    (%rsp,%r11,1),%r10
        subq    $4096,%r11
.byte   0x2e
        jnc     .Lpwr_page_walk

        movq    %r9,%r10
        negq    %r9

        movq    %r8,32(%rsp)
        movq    %rax,40(%rsp)
.Lpower5_body:
.byte   102,72,15,110,207
.byte   102,72,15,110,209
.byte   102,73,15,110,218
.byte   102,72,15,110,226

        call    __bn_sqr8x_internal
        call    __bn_post4x_internal
        call    __bn_sqr8x_internal
        call    __bn_post4x_internal
        call    __bn_sqr8x_internal
        call    __bn_post4x_internal
        call    __bn_sqr8x_internal
        call    __bn_post4x_internal
        call    __bn_sqr8x_internal
        call    __bn_post4x_internal

.byte   102,72,15,126,209
.byte   102,72,15,126,226
        movq    %rsi,%rdi
        movq    40(%rsp),%rax
        leaq    32(%rsp),%r8

        call    mul4x_internal

        movq    40(%rsp),%rsi
        movq    $1,%rax
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lpower5_epilogue:
        .byte   0xf3,0xc3
.size   bn_power5,.-bn_power5

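# bn_sqr8x_internal: dedicated squaring.  The .Lsqr4x_* loops compute
# the off-diagonal products a[i]*a[j] (i < j) once,
# .Lsqr4x_shift_n_add doubles the partial result while adding in the
# diagonal squares a[i]^2, and control then falls through into
# __bn_sqr8x_reduction, which performs the Montgomery reduction eight
# limbs per iteration (.L8x_reduce / .L8x_tail).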
.globl  bn_sqr8x_internal
.hidden bn_sqr8x_internal
.type   bn_sqr8x_internal,@function
.align  32
bn_sqr8x_internal:
__bn_sqr8x_internal:

        leaq    32(%r10),%rbp
        leaq    (%rsi,%r9,1),%rsi

        movq    %r9,%rcx

        movq    -32(%rsi,%rbp,1),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi,%rbp,1),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi,%rbp,1),%rbx
        movq    %rax,%r15

        mulq    %r14
        movq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        movq    %r10,-24(%rdi,%rbp,1)

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        adcq    $0,%rdx
        movq    %r11,-16(%rdi,%rbp,1)
        movq    %rdx,%r10

        movq    -8(%rsi,%rbp,1),%rbx
        mulq    %r15
        movq    %rax,%r12
        movq    %rbx,%rax
        movq    %rdx,%r13

        leaq    (%rbp),%rcx
        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)
        jmp     .Lsqr4x_1st

.align  32
.Lsqr4x_1st:
        movq    (%rsi,%rcx,1),%rbx
        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    8(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,(%rdi,%rcx,1)
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    16(%rsi,%rcx,1),%rbx
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11

        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %r10,8(%rdi,%rcx,1)
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    24(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,16(%rdi,%rcx,1)
        movq    %rdx,%r13
        adcq    $0,%r13
        leaq    32(%rcx),%rcx

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)

        cmpq    $0,%rcx
        jne     .Lsqr4x_1st

        mulq    %r15
        addq    %rax,%r13
        leaq    16(%rbp),%rbp
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)
        jmp     .Lsqr4x_outer

.align  32
.Lsqr4x_outer:
        movq    -32(%rsi,%rbp,1),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi,%rbp,1),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi,%rbp,1),%rbx
        movq    %rax,%r15

        mulq    %r14
        movq    -24(%rdi,%rbp,1),%r10
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        movq    %r10,-24(%rdi,%rbp,1)
        movq    %rdx,%r11

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    -16(%rdi,%rbp,1),%r11
        movq    %rdx,%r10
        adcq    $0,%r10
        movq    %r11,-16(%rdi,%rbp,1)

        xorq    %r12,%r12

        movq    -8(%rsi,%rbp,1),%rbx
        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    -8(%rdi,%rbp,1),%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    %r12,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rbp,1)

        leaq    (%rbp),%rcx
        jmp     .Lsqr4x_inner

.align  32
.Lsqr4x_inner:
        movq    (%rsi,%rcx,1),%rbx
        mulq    %r15
        addq    %rax,%r13
        movq    %rbx,%rax
        movq    %rdx,%r12
        adcq    $0,%r12
        addq    (%rdi,%rcx,1),%r13
        adcq    $0,%r12

.byte   0x67
        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    8(%rsi,%rcx,1),%rbx
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %r11,(%rdi,%rcx,1)
        movq    %rbx,%rax
        movq    %rdx,%r13
        adcq    $0,%r13
        addq    8(%rdi,%rcx,1),%r12
        leaq    16(%rcx),%rcx
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        adcq    $0,%rdx
        addq    %r12,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        movq    %r10,-8(%rdi,%rcx,1)

        cmpq    $0,%rcx
        jne     .Lsqr4x_inner

.byte   0x67
        mulq    %r15
        addq    %rax,%r13
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)

        addq    $16,%rbp
        jnz     .Lsqr4x_outer

        movq    -32(%rsi),%r14
        leaq    48+8(%rsp,%r9,2),%rdi
        movq    -24(%rsi),%rax
        leaq    -32(%rdi,%rbp,1),%rdi
        movq    -16(%rsi),%rbx
        movq    %rax,%r15

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %r14
        addq    %rax,%r11
        movq    %rbx,%rax
        movq    %r10,-24(%rdi)
        movq    %rdx,%r10
        adcq    $0,%r10
        addq    %r13,%r11
        movq    -8(%rsi),%rbx
        adcq    $0,%r10

        mulq    %r15
        addq    %rax,%r12
        movq    %rbx,%rax
        movq    %r11,-16(%rdi)
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %r14
        addq    %rax,%r10
        movq    %rbx,%rax
        movq    %rdx,%r11
        adcq    $0,%r11
        addq    %r12,%r10
        adcq    $0,%r11
        movq    %r10,-8(%rdi)

        mulq    %r15
        addq    %rax,%r13
        movq    -16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx

        movq    %r13,(%rdi)
        movq    %rdx,%r12
        movq    %rdx,8(%rdi)

        mulq    %rbx
        addq    $16,%rbp
        xorq    %r14,%r14
        subq    %r9,%rbp
        xorq    %r15,%r15

        addq    %r12,%rax
        adcq    $0,%rdx
        movq    %rax,8(%rdi)
        movq    %rdx,16(%rdi)
        movq    %r15,24(%rdi)

        movq    -16(%rsi,%rbp,1),%rax
        leaq    48+8(%rsp),%rdi
        xorq    %r10,%r10
        movq    8(%rdi),%r11

        leaq    (%r14,%r10,2),%r12
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    24(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi,%rbp,1),%rax
        movq    %r12,(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    32(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    40(%rdi),%r11
        adcq    %rax,%rbx
        movq    0(%rsi,%rbp,1),%rax
        movq    %rbx,16(%rdi)
        adcq    %rdx,%r8
        leaq    16(%rbp),%rbp
        movq    %r8,24(%rdi)
        sbbq    %r15,%r15
        leaq    64(%rdi),%rdi
        jmp     .Lsqr4x_shift_n_add

.align  32
.Lsqr4x_shift_n_add:
        leaq    (%r14,%r10,2),%r12
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    -16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    -8(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi,%rbp,1),%rax
        movq    %r12,-32(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,-24(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    0(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    8(%rdi),%r11
        adcq    %rax,%rbx
        movq    0(%rsi,%rbp,1),%rax
        movq    %rbx,-16(%rdi)
        adcq    %rdx,%r8

        leaq    (%r14,%r10,2),%r12
        movq    %r8,-8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    24(%rdi),%r11
        adcq    %rax,%r12
        movq    8(%rsi,%rbp,1),%rax
        movq    %r12,0(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        movq    32(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    40(%rdi),%r11
        adcq    %rax,%rbx
        movq    16(%rsi,%rbp,1),%rax
        movq    %rbx,16(%rdi)
        adcq    %rdx,%r8
        movq    %r8,24(%rdi)
        sbbq    %r15,%r15
        leaq    64(%rdi),%rdi
        addq    $32,%rbp
        jnz     .Lsqr4x_shift_n_add

        leaq    (%r14,%r10,2),%r12
.byte   0x67
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r13
        shrq    $63,%r11
        orq     %r10,%r13
        movq    -16(%rdi),%r10
        movq    %r11,%r14
        mulq    %rax
        negq    %r15
        movq    -8(%rdi),%r11
        adcq    %rax,%r12
        movq    -8(%rsi),%rax
        movq    %r12,-32(%rdi)
        adcq    %rdx,%r13

        leaq    (%r14,%r10,2),%rbx
        movq    %r13,-24(%rdi)
        sbbq    %r15,%r15
        shrq    $63,%r10
        leaq    (%rcx,%r11,2),%r8
        shrq    $63,%r11
        orq     %r10,%r8
        mulq    %rax
        negq    %r15
        adcq    %rax,%rbx
        adcq    %rdx,%r8
        movq    %rbx,-16(%rdi)
        movq    %r8,-8(%rdi)
.byte   102,72,15,126,213
__bn_sqr8x_reduction:
        xorq    %rax,%rax
        leaq    (%r9,%rbp,1),%rcx
        leaq    48+8(%rsp,%r9,2),%rdx
        movq    %rcx,0+8(%rsp)
        leaq    48+8(%rsp,%r9,1),%rdi
        movq    %rdx,8+8(%rsp)
        negq    %r9
        jmp     .L8x_reduction_loop

.align  32
.L8x_reduction_loop:
        leaq    (%rdi,%r9,1),%rdi
.byte   0x66
        movq    0(%rdi),%rbx
        movq    8(%rdi),%r9
        movq    16(%rdi),%r10
        movq    24(%rdi),%r11
        movq    32(%rdi),%r12
        movq    40(%rdi),%r13
        movq    48(%rdi),%r14
        movq    56(%rdi),%r15
        movq    %rax,(%rdx)
        leaq    64(%rdi),%rdi

.byte   0x67
        movq    %rbx,%r8
        imulq   32+8(%rsp),%rbx
        movq    0(%rbp),%rax
        movl    $8,%ecx
        jmp     .L8x_reduce

.align  32
.L8x_reduce:
        mulq    %rbx
        movq    8(%rbp),%rax
        negq    %r8
        movq    %rdx,%r8
        adcq    $0,%r8

        mulq    %rbx
        addq    %rax,%r9
        movq    16(%rbp),%rax
        adcq    $0,%rdx
        addq    %r9,%r8
        movq    %rbx,48-8+8(%rsp,%rcx,8)
        movq    %rdx,%r9
        adcq    $0,%r9

        mulq    %rbx
        addq    %rax,%r10
        movq    24(%rbp),%rax
        adcq    $0,%rdx
        addq    %r10,%r9
        movq    32+8(%rsp),%rsi
        movq    %rdx,%r10
        adcq    $0,%r10

        mulq    %rbx
        addq    %rax,%r11
        movq    32(%rbp),%rax
        adcq    $0,%rdx
        imulq   %r8,%rsi
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %rbx
        addq    %rax,%r12
        movq    40(%rbp),%rax
        adcq    $0,%rdx
        addq    %r12,%r11
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %rbx
        addq    %rax,%r13
        movq    48(%rbp),%rax
        adcq    $0,%rdx
        addq    %r13,%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %rbx
        addq    %rax,%r14
        movq    56(%rbp),%rax
        adcq    $0,%rdx
        addq    %r14,%r13
        movq    %rdx,%r14
        adcq    $0,%r14

        mulq    %rbx
        movq    %rsi,%rbx
        addq    %rax,%r15
        movq    0(%rbp),%rax
        adcq    $0,%rdx
        addq    %r15,%r14
        movq    %rdx,%r15
        adcq    $0,%r15

        decl    %ecx
        jnz     .L8x_reduce

        leaq    64(%rbp),%rbp
        xorq    %rax,%rax
        movq    8+8(%rsp),%rdx
        cmpq    0+8(%rsp),%rbp
        jae     .L8x_no_tail

.byte   0x66
        addq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        sbbq    %rsi,%rsi

        movq    48+56+8(%rsp),%rbx
        movl    $8,%ecx
        movq    0(%rbp),%rax
        jmp     .L8x_tail

.align  32
.L8x_tail:
        mulq    %rbx
        addq    %rax,%r8
        movq    8(%rbp),%rax
        movq    %r8,(%rdi)
        movq    %rdx,%r8
        adcq    $0,%r8

        mulq    %rbx
        addq    %rax,%r9
        movq    16(%rbp),%rax
        adcq    $0,%rdx
        addq    %r9,%r8
        leaq    8(%rdi),%rdi
        movq    %rdx,%r9
        adcq    $0,%r9

        mulq    %rbx
        addq    %rax,%r10
        movq    24(%rbp),%rax
        adcq    $0,%rdx
        addq    %r10,%r9
        movq    %rdx,%r10
        adcq    $0,%r10

        mulq    %rbx
        addq    %rax,%r11
        movq    32(%rbp),%rax
        adcq    $0,%rdx
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11

        mulq    %rbx
        addq    %rax,%r12
        movq    40(%rbp),%rax
        adcq    $0,%rdx
        addq    %r12,%r11
        movq    %rdx,%r12
        adcq    $0,%r12

        mulq    %rbx
        addq    %rax,%r13
        movq    48(%rbp),%rax
        adcq    $0,%rdx
        addq    %r13,%r12
        movq    %rdx,%r13
        adcq    $0,%r13

        mulq    %rbx
        addq    %rax,%r14
        movq    56(%rbp),%rax
        adcq    $0,%rdx
        addq    %r14,%r13
        movq    %rdx,%r14
        adcq    $0,%r14

        mulq    %rbx
        movq    48-16+8(%rsp,%rcx,8),%rbx
        addq    %rax,%r15
        adcq    $0,%rdx
        addq    %r15,%r14
        movq    0(%rbp),%rax
        movq    %rdx,%r15
        adcq    $0,%r15

        decl    %ecx
        jnz     .L8x_tail

        leaq    64(%rbp),%rbp
        movq    8+8(%rsp),%rdx
        cmpq    0+8(%rsp),%rbp
        jae     .L8x_tail_done

        movq    48+56+8(%rsp),%rbx
        negq    %rsi
        movq    0(%rbp),%rax
        adcq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        sbbq    %rsi,%rsi

        movl    $8,%ecx
        jmp     .L8x_tail

.align  32
.L8x_tail_done:
        addq    (%rdx),%r8
        adcq    $0,%r9
        adcq    $0,%r10
        adcq    $0,%r11
        adcq    $0,%r12
        adcq    $0,%r13
        adcq    $0,%r14
        adcq    $0,%r15

        xorq    %rax,%rax

        negq    %rsi
.L8x_no_tail:
        adcq    0(%rdi),%r8
        adcq    8(%rdi),%r9
        adcq    16(%rdi),%r10
        adcq    24(%rdi),%r11
        adcq    32(%rdi),%r12
        adcq    40(%rdi),%r13
        adcq    48(%rdi),%r14
        adcq    56(%rdi),%r15
        adcq    $0,%rax
        movq    -8(%rbp),%rcx
        xorq    %rsi,%rsi

.byte   102,72,15,126,213

        movq    %r8,0(%rdi)
        movq    %r9,8(%rdi)
.byte   102,73,15,126,217
        movq    %r10,16(%rdi)
        movq    %r11,24(%rdi)
        movq    %r12,32(%rdi)
        movq    %r13,40(%rdi)
        movq    %r14,48(%rdi)
        movq    %r15,56(%rdi)
        leaq    64(%rdi),%rdi

        cmpq    %rdx,%rdi
        jb      .L8x_reduction_loop
        .byte   0xf3,0xc3
.size   bn_sqr8x_internal,.-bn_sqr8x_internal
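
# __bn_post4x_internal: final conditional subtraction of the modulus.
# The two's complement of np, masked by an all-ones/all-zeroes value in
# %rax, is added limb-by-limb with carry (notq/andq/adcq), so the
# result is tp - np or tp unchanged without any data-dependent branch.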
.type   __bn_post4x_internal,@function
.align  32
__bn_post4x_internal:
        movq    0(%rbp),%r12
        leaq    (%rdi,%r9,1),%rbx
        movq    %r9,%rcx
.byte   102,72,15,126,207
        negq    %rax
.byte   102,72,15,126,206
        sarq    $3+2,%rcx
        decq    %r12
        xorq    %r10,%r10
        movq    8(%rbp),%r13
        movq    16(%rbp),%r14
        movq    24(%rbp),%r15
        jmp     .Lsqr4x_sub_entry

.align  16
.Lsqr4x_sub:
        movq    0(%rbp),%r12
        movq    8(%rbp),%r13
        movq    16(%rbp),%r14
        movq    24(%rbp),%r15
.Lsqr4x_sub_entry:
        leaq    32(%rbp),%rbp
        notq    %r12
        notq    %r13
        notq    %r14
        notq    %r15
        andq    %rax,%r12
        andq    %rax,%r13
        andq    %rax,%r14
        andq    %rax,%r15

        negq    %r10
        adcq    0(%rbx),%r12
        adcq    8(%rbx),%r13
        adcq    16(%rbx),%r14
        adcq    24(%rbx),%r15
        movq    %r12,0(%rdi)
        leaq    32(%rbx),%rbx
        movq    %r13,8(%rdi)
        sbbq    %r10,%r10
        movq    %r14,16(%rdi)
        movq    %r15,24(%rdi)
        leaq    32(%rdi),%rdi

        incq    %rcx
        jnz     .Lsqr4x_sub

        movq    %r9,%r10
        negq    %r9
        .byte   0xf3,0xc3
.size   __bn_post4x_internal,.-__bn_post4x_internal
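
# bn_from_montgomery: convert from Montgomery back to the ordinary
# domain, i.e. one Montgomery reduction of the input (a multiplication
# by 1).  Only num divisible by 8 is handled in assembly; anything else
# returns 0 so the C caller can take the generic path.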
.globl  bn_from_montgomery
.type   bn_from_montgomery,@function
.align  32
bn_from_montgomery:
        testl   $7,%r9d
        jz      bn_from_mont8x
        xorl    %eax,%eax
        .byte   0xf3,0xc3
.size   bn_from_montgomery,.-bn_from_montgomery

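# bn_from_mont8x: the num%8 == 0 implementation.  .Lmul_by_1 copies the
# input into the lower half of the scratch frame while zeroing the
# upper half (the .byte 0x48,0x8d,0xb6,... sequence is an explicitly
# encoded "leaq 64(%rsi),%rsi"); __bn_sqr8x_reduction plus the masked
# subtraction then produce the result, and .Lfrom_mont_zero wipes the
# scratch area before the frame is torn down.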
.type   bn_from_mont8x,@function
.align  32
bn_from_mont8x:
.byte   0x67
        movq    %rsp,%rax
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        shll    $3,%r9d
        leaq    (%r9,%r9,2),%r10
        negq    %r9
        movq    (%r8),%r8

        leaq    -320(%rsp,%r9,2),%r11
        subq    %rdi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lfrom_sp_alt
        subq    %r11,%rsp
        leaq    -320(%rsp,%r9,2),%rsp
        jmp     .Lfrom_sp_done

.align  32
.Lfrom_sp_alt:
        leaq    4096-320(,%r9,2),%r10
        leaq    -320(%rsp,%r9,2),%rsp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11
        subq    %r11,%rsp
.Lfrom_sp_done:
        andq    $-64,%rsp
        movq    %rax,%r11
        subq    %rsp,%r11
        andq    $-4096,%r11
.Lfrom_page_walk:
        movq    (%rsp,%r11,1),%r10
        subq    $4096,%r11
.byte   0x2e
        jnc     .Lfrom_page_walk

        movq    %r9,%r10
        negq    %r9

        movq    %r8,32(%rsp)
        movq    %rax,40(%rsp)
.Lfrom_body:
        movq    %r9,%r11
        leaq    48(%rsp),%rax
        pxor    %xmm0,%xmm0
        jmp     .Lmul_by_1

.align  32
.Lmul_by_1:
        movdqu  (%rsi),%xmm1
        movdqu  16(%rsi),%xmm2
        movdqu  32(%rsi),%xmm3
        movdqa  %xmm0,(%rax,%r9,1)
        movdqu  48(%rsi),%xmm4
        movdqa  %xmm0,16(%rax,%r9,1)
.byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
        movdqa  %xmm1,(%rax)
        movdqa  %xmm0,32(%rax,%r9,1)
        movdqa  %xmm2,16(%rax)
        movdqa  %xmm0,48(%rax,%r9,1)
        movdqa  %xmm3,32(%rax)
        movdqa  %xmm4,48(%rax)
        leaq    64(%rax),%rax
        subq    $64,%r11
        jnz     .Lmul_by_1

.byte   102,72,15,110,207
.byte   102,72,15,110,209
.byte   0x67
        movq    %rcx,%rbp
.byte   102,73,15,110,218
        call    __bn_sqr8x_reduction
        call    __bn_post4x_internal

        pxor    %xmm0,%xmm0
        leaq    48(%rsp),%rax
        movq    40(%rsp),%rsi
        jmp     .Lfrom_mont_zero

.align  32
.Lfrom_mont_zero:
        movdqa  %xmm0,0(%rax)
        movdqa  %xmm0,16(%rax)
        movdqa  %xmm0,32(%rax)
        movdqa  %xmm0,48(%rax)
        leaq    64(%rax),%rax
        subq    $32,%r9
        jnz     .Lfrom_mont_zero

        movq    $1,%rax
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lfrom_epilogue:
        .byte   0xf3,0xc3
.size   bn_from_mont8x,.-bn_from_mont8x
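
# bn_get_bits5(ap, off): return the 5-bit exponent window starting at
# bit "off".  The digit is read with a 16-bit load; for bit offsets
# 12-15 within a word the load is taken one byte later and the shift
# reduced by 8, so the window never runs off the end of the loaded
# halfword.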
.globl  bn_get_bits5
.type   bn_get_bits5,@function
.align  16
bn_get_bits5:
        leaq    0(%rdi),%r10
        leaq    1(%rdi),%r11
        movl    %esi,%ecx
        shrl    $4,%esi
        andl    $15,%ecx
        leal    -8(%rcx),%eax
        cmpl    $11,%ecx
        cmovaq  %r11,%r10
        cmoval  %eax,%ecx
        movzwl  (%r10,%rsi,2),%eax
        shrl    %cl,%eax
        andl    $31,%eax
        .byte   0xf3,0xc3
.size   bn_get_bits5,.-bn_get_bits5

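# bn_scatter5(inp, num, tbl, idx): store the num-limb value inp as
# column idx of the power table -- limb i lands at tbl + idx*8 + i*256,
# interleaving the 32 table entries limb by limb so that the gather
# below touches every entry identically.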
.globl  bn_scatter5
.type   bn_scatter5,@function
.align  16
bn_scatter5:
        cmpl    $0,%esi
        jz      .Lscatter_epilogue
        leaq    (%rdx,%rcx,8),%rdx
.Lscatter:
        movq    (%rdi),%rax
        leaq    8(%rdi),%rdi
        movq    %rax,(%rdx)
        leaq    256(%rdx),%rdx
        subl    $1,%esi
        jnz     .Lscatter
.Lscatter_epilogue:
        .byte   0xf3,0xc3
.size   bn_scatter5,.-bn_scatter5

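# bn_gather5(out, num, tbl, idx): constant-time inverse of bn_scatter5.
# A mask table for idx is built on the stack, then every 256-byte row
# is pand-ed against it and por-ed together, so the memory access
# pattern is independent of idx.  The two leading .byte sequences
# encode "leaq (%rsp),%r10" and "subq $0x108,%rsp"; they are emitted as
# raw bytes, presumably so the .LSEH_begin/.LSEH_end markers frame an
# exact, unwinder-recognizable prologue on Win64 builds.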
.globl  bn_gather5
.type   bn_gather5,@function
.align  32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte   0x4c,0x8d,0x14,0x24
.byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00
        leaq    .Linc(%rip),%rax
        andq    $-16,%rsp

        movd    %ecx,%xmm5
        movdqa  0(%rax),%xmm0
        movdqa  16(%rax),%xmm1
        leaq    128(%rdx),%r11
        leaq    128(%rsp),%rax

        pshufd  $0,%xmm5,%xmm5
        movdqa  %xmm1,%xmm4
        movdqa  %xmm1,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,-128(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-112(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-96(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-80(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,-64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,-48(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,-32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,-16(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,0(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,16(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,32(%rax)
        movdqa  %xmm4,%xmm2
        paddd   %xmm0,%xmm1
        pcmpeqd %xmm5,%xmm0
        movdqa  %xmm3,48(%rax)
        movdqa  %xmm4,%xmm3

        paddd   %xmm1,%xmm2
        pcmpeqd %xmm5,%xmm1
        movdqa  %xmm0,64(%rax)
        movdqa  %xmm4,%xmm0

        paddd   %xmm2,%xmm3
        pcmpeqd %xmm5,%xmm2
        movdqa  %xmm1,80(%rax)
        movdqa  %xmm4,%xmm1

        paddd   %xmm3,%xmm0
        pcmpeqd %xmm5,%xmm3
        movdqa  %xmm2,96(%rax)
        movdqa  %xmm4,%xmm2
        movdqa  %xmm3,112(%rax)
        jmp     .Lgather

.align  32
.Lgather:
        pxor    %xmm4,%xmm4
        pxor    %xmm5,%xmm5
        movdqa  -128(%r11),%xmm0
        movdqa  -112(%r11),%xmm1
        movdqa  -96(%r11),%xmm2
        pand    -128(%rax),%xmm0
        movdqa  -80(%r11),%xmm3
        pand    -112(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -80(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  -64(%r11),%xmm0
        movdqa  -48(%r11),%xmm1
        movdqa  -32(%r11),%xmm2
        pand    -64(%rax),%xmm0
        movdqa  -16(%r11),%xmm3
        pand    -48(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    -32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    -16(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  0(%r11),%xmm0
        movdqa  16(%r11),%xmm1
        movdqa  32(%r11),%xmm2
        pand    0(%rax),%xmm0
        movdqa  48(%r11),%xmm3
        pand    16(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    32(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    48(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqa  64(%r11),%xmm0
        movdqa  80(%r11),%xmm1
        movdqa  96(%r11),%xmm2
        pand    64(%rax),%xmm0
        movdqa  112(%r11),%xmm3
        pand    80(%rax),%xmm1
        por     %xmm0,%xmm4
        pand    96(%rax),%xmm2
        por     %xmm1,%xmm5
        pand    112(%rax),%xmm3
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        por     %xmm5,%xmm4
        leaq    256(%r11),%r11
        pshufd  $0x4e,%xmm4,%xmm0
        por     %xmm4,%xmm0
        movq    %xmm0,(%rdi)
        leaq    8(%rdi),%rdi
        subl    $1,%esi
        jnz     .Lgather

        leaq    (%r10),%rsp
        .byte   0xf3,0xc3
.LSEH_end_bn_gather5:
.size   bn_gather5,.-bn_gather5
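
# .Linc: the {0,0,1,1} seed and {2,2,2,2} increment used to enumerate
# indices 0..31 two at a time when building the pcmpeqd selection
# masks.  The trailing .byte string is the CRYPTOGAMS attribution:
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
# by <appro@openssl.org>".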
.align  64
.Linc:
.long   0,0, 1,1
.long   2,2, 2,2
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0