# $FreeBSD$
.text

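# bn_mul_mont_gather5 is the scalar Montgomery-multiplication entry point of
# this machine-generated file (OpenSSL's x86_64-mont5.pl).  Going by the SysV
# AMD64 ABI, the arguments are presumably:
#   %rdi = rp (result), %rsi = ap, %rdx = bp (scatter table of powers, see
#   bn_scatter5 below), %rcx = np (modulus), %r8 = n0 (-1/n[0] mod 2^64),
#   %r9d = num (limb count), 8(%rsp) = power (table index, 0..31).
# Each 64-bit word of bp[power] is gathered in constant time as it is needed.
# Sizes with num % 4 == 0 and num >= 8 are dispatched to bn_mul4x_mont_gather5.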
.globl  bn_mul_mont_gather5
.type   bn_mul_mont_gather5,@function
.align  64
bn_mul_mont_gather5:
        testl   $3,%r9d
        jnz     .Lmul_enter
        cmpl    $8,%r9d
        jb      .Lmul_enter
        jmp     .Lmul4x_enter

.align  16
.Lmul_enter:
        movl    %r9d,%r9d
        movl    8(%rsp),%r10d
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        movq    %rsp,%rax
        leaq    2(%r9),%r11
        negq    %r11
        leaq    (%rsp,%r11,8),%rsp
        andq    $-1024,%rsp

        movq    %rax,8(%rsp,%r9,8)
.Lmul_body:
        movq    %rdx,%r12
        movq    %r10,%r11
        shrq    $3,%r10
        andq    $7,%r11
        notq    %r10
        leaq    .Lmagic_masks(%rip),%rax
        andq    $3,%r10
        leaq    96(%r12,%r11,8),%r12
        movq    0(%rax,%r10,8),%xmm4
        movq    8(%rax,%r10,8),%xmm5
        movq    16(%rax,%r10,8),%xmm6
        movq    24(%rax,%r10,8),%xmm7

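# Gather one word of bp[power] in constant time: the table is interleaved with
# a 256-byte stride per word index, and of the four masks loaded from
# .Lmagic_masks exactly one is all-ones, so the pand/por sequence keeps one of
# the four strided loads without using a secret-dependent load address (a
# defence against cache-timing side channels).  The raw
# ".byte 102,72,15,126,195" sequences below encode movq %xmm0,%rbx
# (66 48 0f 7e c3), moving the gathered word into %rbx.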
        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1
        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3
        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

.byte   102,72,15,126,195

        movq    (%r8),%r8
        movq    (%rsi),%rax

        xorq    %r14,%r14
        xorq    %r15,%r15

        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1

        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax

        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   %r10,%rbp
        movq    %rdx,%r11

        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .L1st_enter

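# First outer iteration (i = 0): accumulate a[]*b[0] + m*n[] into the stack
# temporary tp, where m = (a[0]*b[0])*n0 mod 2^64.  The masked SSE loads
# interleaved above have already begun gathering the next word of bp[power].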
.align  16
.L1st:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        movq    %r10,%r11
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.L1st_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        leaq    1(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .L1st

.byte   102,72,15,126,195

        addq    %rax,%r13
        movq    (%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13
        movq    %r10,%r11

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)
        movq    %rdx,(%rsp,%r9,8)

        leaq    1(%r14),%r14
        jmp     .Louter
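# Outer loop: one Montgomery step per remaining word of the multiplicand,
# each word gathered in constant time with the same masked-load pattern and
# folded into the running sum kept in tp on the stack.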
.align  16
.Louter:
        xorq    %r15,%r15
        movq    %r8,%rbp
        movq    (%rsp),%r10

        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   %r10,%rbp
        movq    %rdx,%r11

        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    8(%rsp),%r10
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .Linner_enter

.align  16
.Linner:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.Linner_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        leaq    1(%r15),%r15

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .Linner

.byte   102,72,15,126,195

        addq    %rax,%r13
        movq    (%rsi),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)
        movq    %rdx,(%rsp,%r9,8)

        leaq    1(%r14),%r14
        cmpq    %r9,%r14
        jl      .Louter

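# Final reduction: tp may still be >= n, so subtract the modulus once (.Lsub),
# storing the difference into rp as it goes.  A mask derived from the final
# borrow then selects whether .Lcopy reads tp or the already-stored difference
# in rp; .Lcopy also overwrites the stack temporary while copying.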
        xorq    %r14,%r14
        movq    (%rsp),%rax
        leaq    (%rsp),%rsi
        movq    %r9,%r15
        jmp     .Lsub
.align  16
.Lsub:  sbbq    (%rcx,%r14,8),%rax
        movq    %rax,(%rdi,%r14,8)
        movq    8(%rsi,%r14,8),%rax
        leaq    1(%r14),%r14
        decq    %r15
        jnz     .Lsub

        sbbq    $0,%rax
        xorq    %r14,%r14
        andq    %rax,%rsi
        notq    %rax
        movq    %rdi,%rcx
        andq    %rax,%rcx
        movq    %r9,%r15
        orq     %rcx,%rsi
.align  16
.Lcopy:
        movq    (%rsi,%r14,8),%rax
        movq    %r14,(%rsp,%r14,8)
        movq    %rax,(%rdi,%r14,8)
        leaq    1(%r14),%r14
        subq    $1,%r15
        jnz     .Lcopy

        movq    8(%rsp,%r9,8),%rsi
        movq    $1,%rax
        movq    (%rsi),%r15
        movq    8(%rsi),%r14
        movq    16(%rsi),%r13
        movq    24(%rsi),%r12
        movq    32(%rsi),%rbp
        movq    40(%rsi),%rbx
        leaq    48(%rsi),%rsp
.Lmul_epilogue:
        .byte   0xf3,0xc3
.size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
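# bn_mul4x_mont_gather5: 4-way unrolled variant of the routine above, reached
# from the dispatch at the top when num is a multiple of 4 and at least 8.
# Same presumed argument layout as bn_mul_mont_gather5.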
.type   bn_mul4x_mont_gather5,@function
.align  16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
        movl    %r9d,%r9d
        movl    8(%rsp),%r10d
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        movq    %rsp,%rax
        leaq    4(%r9),%r11
        negq    %r11
        leaq    (%rsp,%r11,8),%rsp
        andq    $-1024,%rsp

        movq    %rax,8(%rsp,%r9,8)
.Lmul4x_body:
        movq    %rdi,16(%rsp,%r9,8)
        movq    %rdx,%r12
        movq    %r10,%r11
        shrq    $3,%r10
        andq    $7,%r11
        notq    %r10
        leaq    .Lmagic_masks(%rip),%rax
        andq    $3,%r10
        leaq    96(%r12,%r11,8),%r12
        movq    0(%rax,%r10,8),%xmm4
        movq    8(%rax,%r10,8),%xmm5
        movq    16(%rax,%r10,8),%xmm6
        movq    24(%rax,%r10,8),%xmm7

        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1
        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3
        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

.byte   102,72,15,126,195
        movq    (%r8),%r8
        movq    (%rsi),%rax

        xorq    %r14,%r14
        xorq    %r15,%r15

        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1

        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax

        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   %r10,%rbp
        movq    %rdx,%r11

        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    4(%r15),%r15
        adcq    $0,%rdx
        movq    %rdi,(%rsp)
        movq    %rdx,%r13
        jmp     .L1st4x
.align  16
.L1st4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        leaq    4(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    -16(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-32(%rsp,%r15,8)
        movq    %rdx,%r13
        cmpq    %r9,%r15
        jl      .L1st4x

        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.byte   102,72,15,126,195

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdi,(%rsp,%r15,8)

        leaq    1(%r14),%r14
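# Outer loop of the 4x path: as in the scalar version, each iteration gathers
# the next multiplicand word with masked SSE loads and folds four limbs at a
# time into the accumulator kept in tp.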
.align  4
.Louter4x:
        xorq    %r15,%r15
        movq    -96(%r12),%xmm0
        movq    -32(%r12),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%r12),%xmm2
        pand    %xmm5,%xmm1

        movq    (%rsp),%r10
        movq    %r8,%rbp
        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        movq    96(%r12),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   %r10,%rbp
        movq    %rdx,%r11

        por     %xmm2,%xmm0
        leaq    256(%r12),%r12
        por     %xmm3,%xmm0

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        addq    8(%rsp),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    4(%r15),%r15
        adcq    $0,%rdx
        movq    %rdx,%r13
        jmp     .Linner4x
.align  16
.Linner4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -16(%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-32(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        leaq    4(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    -16(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %r13,-40(%rsp,%r15,8)
        movq    %rdx,%r13
        cmpq    %r9,%r15
        jl      .Linner4x

        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -16(%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %rdi,-32(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        leaq    1(%r14),%r14
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%r13

.byte   102,72,15,126,195
        movq    %rdi,-16(%rsp,%r15,8)

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        addq    (%rsp,%r9,8),%r13
        adcq    $0,%rdi
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdi,(%rsp,%r15,8)

        cmpq    %r9,%r14
        jl      .Louter4x
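# Final reduction for the 4x path: subtract the modulus four limbs per
# iteration (.Lsub4x), derive the copy source from the final borrow as in the
# scalar path, then stream the result to rp with SSE while clearing the stack
# temporary (.Lcopy4x).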
        movq    16(%rsp,%r9,8),%rdi
        movq    0(%rsp),%rax
        pxor    %xmm0,%xmm0
        movq    8(%rsp),%rdx
        shrq    $2,%r9
        leaq    (%rsp),%rsi
        xorq    %r14,%r14

        subq    0(%rcx),%rax
        movq    16(%rsi),%rbx
        movq    24(%rsi),%rbp
        sbbq    8(%rcx),%rdx
        leaq    -1(%r9),%r15
        jmp     .Lsub4x
.align  16
.Lsub4x:
        movq    %rax,0(%rdi,%r14,8)
        movq    %rdx,8(%rdi,%r14,8)
        sbbq    16(%rcx,%r14,8),%rbx
        movq    32(%rsi,%r14,8),%rax
        movq    40(%rsi,%r14,8),%rdx
        sbbq    24(%rcx,%r14,8),%rbp
        movq    %rbx,16(%rdi,%r14,8)
        movq    %rbp,24(%rdi,%r14,8)
        sbbq    32(%rcx,%r14,8),%rax
        movq    48(%rsi,%r14,8),%rbx
        movq    56(%rsi,%r14,8),%rbp
        sbbq    40(%rcx,%r14,8),%rdx
        leaq    4(%r14),%r14
        decq    %r15
        jnz     .Lsub4x

        movq    %rax,0(%rdi,%r14,8)
        movq    32(%rsi,%r14,8),%rax
        sbbq    16(%rcx,%r14,8),%rbx
        movq    %rdx,8(%rdi,%r14,8)
        sbbq    24(%rcx,%r14,8),%rbp
        movq    %rbx,16(%rdi,%r14,8)

        sbbq    $0,%rax
        movq    %rbp,24(%rdi,%r14,8)
        xorq    %r14,%r14
        andq    %rax,%rsi
        notq    %rax
        movq    %rdi,%rcx
        andq    %rax,%rcx
        leaq    -1(%r9),%r15
        orq     %rcx,%rsi

        movdqu  (%rsi),%xmm1
        movdqa  %xmm0,(%rsp)
        movdqu  %xmm1,(%rdi)
        jmp     .Lcopy4x
.align  16
.Lcopy4x:
        movdqu  16(%rsi,%r14,1),%xmm2
        movdqu  32(%rsi,%r14,1),%xmm1
        movdqa  %xmm0,16(%rsp,%r14,1)
        movdqu  %xmm2,16(%rdi,%r14,1)
        movdqa  %xmm0,32(%rsp,%r14,1)
        movdqu  %xmm1,32(%rdi,%r14,1)
        leaq    32(%r14),%r14
        decq    %r15
        jnz     .Lcopy4x

        shlq    $2,%r9
        movdqu  16(%rsi,%r14,1),%xmm2
        movdqa  %xmm0,16(%rsp,%r14,1)
        movdqu  %xmm2,16(%rdi,%r14,1)
        movq    8(%rsp,%r9,8),%rsi
        movq    $1,%rax
        movq    (%rsi),%r15
        movq    8(%rsi),%r14
        movq    16(%rsi),%r13
        movq    24(%rsi),%r12
        movq    32(%rsi),%rbp
        movq    40(%rsi),%rbx
        leaq    48(%rsi),%rsp
.Lmul4x_epilogue:
        .byte   0xf3,0xc3
.size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
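# bn_scatter5 presumably takes (inp, num, table, power) in %rdi, %rsi, %rdx,
# %rcx: it stores the num words of inp into the table at a 256-byte stride,
# starting at table + power*8, producing the interleaved layout that the
# gather code in this file reads back.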
.globl  bn_scatter5
.type   bn_scatter5,@function
.align  16
bn_scatter5:
        cmpq    $0,%rsi
        jz      .Lscatter_epilogue
        leaq    (%rdx,%rcx,8),%rdx
.Lscatter:
        movq    (%rdi),%rax
        leaq    8(%rdi),%rdi
        movq    %rax,(%rdx)
        leaq    256(%rdx),%rdx
        subq    $1,%rsi
        jnz     .Lscatter
.Lscatter_epilogue:
        .byte   0xf3,0xc3
.size   bn_scatter5,.-bn_scatter5

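# bn_gather5 presumably takes (out, num, table, power) in %rdi, %rsi, %rdx,
# %rcx and copies entry 'power' (0..31) out of the scatter table in constant
# time, word by word, using the same masked-load technique as the
# multiplication routines above.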
.globl  bn_gather5
.type   bn_gather5,@function
.align  16
bn_gather5:
        movq    %rcx,%r11
        shrq    $3,%rcx
        andq    $7,%r11
        notq    %rcx
        leaq    .Lmagic_masks(%rip),%rax
        andq    $3,%rcx
        leaq    96(%rdx,%r11,8),%rdx
        movq    0(%rax,%rcx,8),%xmm4
        movq    8(%rax,%rcx,8),%xmm5
        movq    16(%rax,%rcx,8),%xmm6
        movq    24(%rax,%rcx,8),%xmm7
        jmp     .Lgather
.align  16
.Lgather:
        movq    -96(%rdx),%xmm0
        movq    -32(%rdx),%xmm1
        pand    %xmm4,%xmm0
        movq    32(%rdx),%xmm2
        pand    %xmm5,%xmm1
        movq    96(%rdx),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3
        por     %xmm2,%xmm0
        leaq    256(%rdx),%rdx
        por     %xmm3,%xmm0

        movq    %xmm0,(%rdi)
        leaq    8(%rdi),%rdi
        subq    $1,%rsi
        jnz     .Lgather
        .byte   0xf3,0xc3
.LSEH_end_bn_gather5:
.size   bn_gather5,.-bn_gather5
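# .Lmagic_masks holds eight 64-bit values (as sixteen 32-bit words); the
# 4-qword window selected by the high bits of the power index contains exactly
# one all-ones mask, which is what makes the four-way pand/por gather above
# keep a single table entry.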
.align  64
.Lmagic_masks:
.long   0,0, 0,0, 0,0, -1,-1
.long   0,0, 0,0, 0,0,  0,0
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
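# The .byte string above spells "Montgomery Multiplication with scatter/gather
# for x86_64, CRYPTOGAMS by <appro@openssl.org>".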