1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
3 .text   
4
5
6
7 .globl  poly1305_init
8 .hidden poly1305_init
9 .globl  poly1305_blocks
10 .hidden poly1305_blocks
11 .globl  poly1305_emit
12 .hidden poly1305_emit
13
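/*
 * poly1305_init(ctx, key, func) -- SysV ABI: %rdi=ctx, %rsi=key, %rdx=func
 * (argument names inferred from register usage).  Clears the 130-bit
 * accumulator at ctx+0..23 and, if key is non-NULL, clamps the first 16 key
 * bytes into r (stored at ctx+24), writes the blocks/emit entry points
 * selected via the OPENSSL_ia32cap_P AVX/AVX2 feature bits into func[0] and
 * func[1], and returns 1.  Returns 0 when key is NULL.
 */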
14 .type   poly1305_init,@function
15 .align  32
16 poly1305_init:
17 .cfi_startproc  
18         xorq    %rax,%rax
19         movq    %rax,0(%rdi)
20         movq    %rax,8(%rdi)
21         movq    %rax,16(%rdi)
22
23         cmpq    $0,%rsi
24         je      .Lno_key
25
26         leaq    poly1305_blocks(%rip),%r10
27         leaq    poly1305_emit(%rip),%r11
28         movq    OPENSSL_ia32cap_P+4(%rip),%r9
29         leaq    poly1305_blocks_avx(%rip),%rax
30         leaq    poly1305_emit_avx(%rip),%rcx
31         btq     $28,%r9
32         cmovcq  %rax,%r10
33         cmovcq  %rcx,%r11
34         leaq    poly1305_blocks_avx2(%rip),%rax
35         btq     $37,%r9
36         cmovcq  %rax,%r10
37         movq    $0x0ffffffc0fffffff,%rax
38         movq    $0x0ffffffc0ffffffc,%rcx
39         andq    0(%rsi),%rax
40         andq    8(%rsi),%rcx
41         movq    %rax,24(%rdi)
42         movq    %rcx,32(%rdi)
43         movq    %r10,0(%rdx)
44         movq    %r11,8(%rdx)
45         movl    $1,%eax
46 .Lno_key:
47         .byte   0xf3,0xc3
48 .cfi_endproc    
49 .size   poly1305_init,.-poly1305_init
50
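/*
 * poly1305_blocks(ctx, inp, len, padbit) -- %rdi=ctx, %rsi=inp, %rdx=len,
 * %rcx=padbit.  Scalar base 2^64 path: for each 16-byte block,
 * h += block | padbit<<128, then h = h*r mod 2^130-5.
 */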
51 .type   poly1305_blocks,@function
52 .align  32
53 poly1305_blocks:
54 .cfi_startproc  
55 .Lblocks:
56         shrq    $4,%rdx
57         jz      .Lno_data
58
59         pushq   %rbx
60 .cfi_adjust_cfa_offset  8
61 .cfi_offset     %rbx,-16
62         pushq   %rbp
63 .cfi_adjust_cfa_offset  8
64 .cfi_offset     %rbp,-24
65         pushq   %r12
66 .cfi_adjust_cfa_offset  8
67 .cfi_offset     %r12,-32
68         pushq   %r13
69 .cfi_adjust_cfa_offset  8
70 .cfi_offset     %r13,-40
71         pushq   %r14
72 .cfi_adjust_cfa_offset  8
73 .cfi_offset     %r14,-48
74         pushq   %r15
75 .cfi_adjust_cfa_offset  8
76 .cfi_offset     %r15,-56
77 .Lblocks_body:
78
79         movq    %rdx,%r15
80
81         movq    24(%rdi),%r11
82         movq    32(%rdi),%r13
83
84         movq    0(%rdi),%r14
85         movq    8(%rdi),%rbx
86         movq    16(%rdi),%rbp
87
88         movq    %r13,%r12
89         shrq    $2,%r13
90         movq    %r12,%rax
91         addq    %r12,%r13
92         jmp     .Loop
93
94 .align  32
95 .Loop:
96         addq    0(%rsi),%r14
97         adcq    8(%rsi),%rbx
98         leaq    16(%rsi),%rsi
99         adcq    %rcx,%rbp
100         mulq    %r14
101         movq    %rax,%r9
102         movq    %r11,%rax
103         movq    %rdx,%r10
104
105         mulq    %r14
106         movq    %rax,%r14
107         movq    %r11,%rax
108         movq    %rdx,%r8
109
110         mulq    %rbx
111         addq    %rax,%r9
112         movq    %r13,%rax
113         adcq    %rdx,%r10
114
115         mulq    %rbx
116         movq    %rbp,%rbx
117         addq    %rax,%r14
118         adcq    %rdx,%r8
119
120         imulq   %r13,%rbx
121         addq    %rbx,%r9
122         movq    %r8,%rbx
123         adcq    $0,%r10
124
125         imulq   %r11,%rbp
126         addq    %r9,%rbx
127         movq    $-4,%rax
128         adcq    %rbp,%r10
129
130         andq    %r10,%rax
131         movq    %r10,%rbp
132         shrq    $2,%r10
133         andq    $3,%rbp
134         addq    %r10,%rax
135         addq    %rax,%r14
136         adcq    $0,%rbx
137         adcq    $0,%rbp
138         movq    %r12,%rax
139         decq    %r15
140         jnz     .Loop
141
142         movq    %r14,0(%rdi)
143         movq    %rbx,8(%rdi)
144         movq    %rbp,16(%rdi)
145
146         movq    0(%rsp),%r15
147 .cfi_restore    %r15
148         movq    8(%rsp),%r14
149 .cfi_restore    %r14
150         movq    16(%rsp),%r13
151 .cfi_restore    %r13
152         movq    24(%rsp),%r12
153 .cfi_restore    %r12
154         movq    32(%rsp),%rbp
155 .cfi_restore    %rbp
156         movq    40(%rsp),%rbx
157 .cfi_restore    %rbx
158         leaq    48(%rsp),%rsp
159 .cfi_adjust_cfa_offset  -48
160 .Lno_data:
161 .Lblocks_epilogue:
162         .byte   0xf3,0xc3
163 .cfi_endproc    
164 .size   poly1305_blocks,.-poly1305_blocks
165
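/*
 * poly1305_emit(ctx, mac, nonce) -- %rdi=ctx, %rsi=mac, %rdx=nonce.
 * Final reduction of h modulo 2^130-5, addition of the 128-bit nonce,
 * and store of the 16-byte tag at mac.
 */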
166 .type   poly1305_emit,@function
167 .align  32
168 poly1305_emit:
169 .cfi_startproc  
170 .Lemit:
171         movq    0(%rdi),%r8
172         movq    8(%rdi),%r9
173         movq    16(%rdi),%r10
174
175         movq    %r8,%rax
176         addq    $5,%r8
177         movq    %r9,%rcx
178         adcq    $0,%r9
179         adcq    $0,%r10
180         shrq    $2,%r10
181         cmovnzq %r8,%rax
182         cmovnzq %r9,%rcx
183
184         addq    0(%rdx),%rax
185         adcq    8(%rdx),%rcx
186         movq    %rax,0(%rsi)
187         movq    %rcx,8(%rsi)
188
189         .byte   0xf3,0xc3
190 .cfi_endproc    
191 .size   poly1305_emit,.-poly1305_emit
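/*
 * __poly1305_block: one h = h*r mod 2^130-5 step, used by
 * __poly1305_init_avx and by the base 2^64 pre-processing in the AVX/AVX2
 * entry points.  Register contract (as set up by the callers): h in
 * %r14/%rbx/%rbp, r0 in %r11, r1 in %rax, r1+(r1>>2) in %r13.
 */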
192 .type   __poly1305_block,@function
193 .align  32
194 __poly1305_block:
195 .cfi_startproc  
196         mulq    %r14
197         movq    %rax,%r9
198         movq    %r11,%rax
199         movq    %rdx,%r10
200
201         mulq    %r14
202         movq    %rax,%r14
203         movq    %r11,%rax
204         movq    %rdx,%r8
205
206         mulq    %rbx
207         addq    %rax,%r9
208         movq    %r13,%rax
209         adcq    %rdx,%r10
210
211         mulq    %rbx
212         movq    %rbp,%rbx
213         addq    %rax,%r14
214         adcq    %rdx,%r8
215
216         imulq   %r13,%rbx
217         addq    %rbx,%r9
218         movq    %r8,%rbx
219         adcq    $0,%r10
220
221         imulq   %r11,%rbp
222         addq    %r9,%rbx
223         movq    $-4,%rax
224         adcq    %rbp,%r10
225
226         andq    %r10,%rax
227         movq    %r10,%rbp
228         shrq    $2,%r10
229         andq    $3,%rbp
230         addq    %r10,%rax
231         addq    %rax,%r14
232         adcq    $0,%rbx
233         adcq    $0,%rbp
234         .byte   0xf3,0xc3
235 .cfi_endproc    
236 .size   __poly1305_block,.-__poly1305_block
237
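/*
 * __poly1305_init_avx: key expansion for the vector paths.  Computes r^2,
 * r^3 and r^4 via __poly1305_block and stores r^1..r^4 in base 2^26,
 * interleaved with their 5* multiples, in the per-context table starting
 * at ctx+48 that the AVX/AVX2 block routines read.
 */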
238 .type   __poly1305_init_avx,@function
239 .align  32
240 __poly1305_init_avx:
241 .cfi_startproc  
242         movq    %r11,%r14
243         movq    %r12,%rbx
244         xorq    %rbp,%rbp
245
246         leaq    48+64(%rdi),%rdi
247
248         movq    %r12,%rax
249         call    __poly1305_block
250
251         movl    $0x3ffffff,%eax
252         movl    $0x3ffffff,%edx
253         movq    %r14,%r8
254         andl    %r14d,%eax
255         movq    %r11,%r9
256         andl    %r11d,%edx
257         movl    %eax,-64(%rdi)
258         shrq    $26,%r8
259         movl    %edx,-60(%rdi)
260         shrq    $26,%r9
261
262         movl    $0x3ffffff,%eax
263         movl    $0x3ffffff,%edx
264         andl    %r8d,%eax
265         andl    %r9d,%edx
266         movl    %eax,-48(%rdi)
267         leal    (%rax,%rax,4),%eax
268         movl    %edx,-44(%rdi)
269         leal    (%rdx,%rdx,4),%edx
270         movl    %eax,-32(%rdi)
271         shrq    $26,%r8
272         movl    %edx,-28(%rdi)
273         shrq    $26,%r9
274
275         movq    %rbx,%rax
276         movq    %r12,%rdx
277         shlq    $12,%rax
278         shlq    $12,%rdx
279         orq     %r8,%rax
280         orq     %r9,%rdx
281         andl    $0x3ffffff,%eax
282         andl    $0x3ffffff,%edx
283         movl    %eax,-16(%rdi)
284         leal    (%rax,%rax,4),%eax
285         movl    %edx,-12(%rdi)
286         leal    (%rdx,%rdx,4),%edx
287         movl    %eax,0(%rdi)
288         movq    %rbx,%r8
289         movl    %edx,4(%rdi)
290         movq    %r12,%r9
291
292         movl    $0x3ffffff,%eax
293         movl    $0x3ffffff,%edx
294         shrq    $14,%r8
295         shrq    $14,%r9
296         andl    %r8d,%eax
297         andl    %r9d,%edx
298         movl    %eax,16(%rdi)
299         leal    (%rax,%rax,4),%eax
300         movl    %edx,20(%rdi)
301         leal    (%rdx,%rdx,4),%edx
302         movl    %eax,32(%rdi)
303         shrq    $26,%r8
304         movl    %edx,36(%rdi)
305         shrq    $26,%r9
306
307         movq    %rbp,%rax
308         shlq    $24,%rax
309         orq     %rax,%r8
310         movl    %r8d,48(%rdi)
311         leaq    (%r8,%r8,4),%r8
312         movl    %r9d,52(%rdi)
313         leaq    (%r9,%r9,4),%r9
314         movl    %r8d,64(%rdi)
315         movl    %r9d,68(%rdi)
316
317         movq    %r12,%rax
318         call    __poly1305_block
319
320         movl    $0x3ffffff,%eax
321         movq    %r14,%r8
322         andl    %r14d,%eax
323         shrq    $26,%r8
324         movl    %eax,-52(%rdi)
325
326         movl    $0x3ffffff,%edx
327         andl    %r8d,%edx
328         movl    %edx,-36(%rdi)
329         leal    (%rdx,%rdx,4),%edx
330         shrq    $26,%r8
331         movl    %edx,-20(%rdi)
332
333         movq    %rbx,%rax
334         shlq    $12,%rax
335         orq     %r8,%rax
336         andl    $0x3ffffff,%eax
337         movl    %eax,-4(%rdi)
338         leal    (%rax,%rax,4),%eax
339         movq    %rbx,%r8
340         movl    %eax,12(%rdi)
341
342         movl    $0x3ffffff,%edx
343         shrq    $14,%r8
344         andl    %r8d,%edx
345         movl    %edx,28(%rdi)
346         leal    (%rdx,%rdx,4),%edx
347         shrq    $26,%r8
348         movl    %edx,44(%rdi)
349
350         movq    %rbp,%rax
351         shlq    $24,%rax
352         orq     %rax,%r8
353         movl    %r8d,60(%rdi)
354         leaq    (%r8,%r8,4),%r8
355         movl    %r8d,76(%rdi)
356
357         movq    %r12,%rax
358         call    __poly1305_block
359
360         movl    $0x3ffffff,%eax
361         movq    %r14,%r8
362         andl    %r14d,%eax
363         shrq    $26,%r8
364         movl    %eax,-56(%rdi)
365
366         movl    $0x3ffffff,%edx
367         andl    %r8d,%edx
368         movl    %edx,-40(%rdi)
369         leal    (%rdx,%rdx,4),%edx
370         shrq    $26,%r8
371         movl    %edx,-24(%rdi)
372
373         movq    %rbx,%rax
374         shlq    $12,%rax
375         orq     %r8,%rax
376         andl    $0x3ffffff,%eax
377         movl    %eax,-8(%rdi)
378         leal    (%rax,%rax,4),%eax
379         movq    %rbx,%r8
380         movl    %eax,8(%rdi)
381
382         movl    $0x3ffffff,%edx
383         shrq    $14,%r8
384         andl    %r8d,%edx
385         movl    %edx,24(%rdi)
386         leal    (%rdx,%rdx,4),%edx
387         shrq    $26,%r8
388         movl    %edx,40(%rdi)
389
390         movq    %rbp,%rax
391         shlq    $24,%rax
392         orq     %rax,%r8
393         movl    %r8d,56(%rdi)
394         leaq    (%r8,%r8,4),%r8
395         movl    %r8d,72(%rdi)
396
397         leaq    -48-64(%rdi),%rdi
398         .byte   0xf3,0xc3
399 .cfi_endproc    
400 .size   __poly1305_init_avx,.-__poly1305_init_avx
401
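/*
 * poly1305_blocks_avx: 128-bit AVX path, two blocks per iteration.  Falls
 * back to the scalar .Lblocks code for short inputs, converts the
 * accumulator between base 2^64 and base 2^26 as needed (the dword at
 * ctx+20 records which radix the state is in), and initializes the key
 * power table on first use via __poly1305_init_avx.
 */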
402 .type   poly1305_blocks_avx,@function
403 .align  32
404 poly1305_blocks_avx:
405 .cfi_startproc  
406         movl    20(%rdi),%r8d
407         cmpq    $128,%rdx
408         jae     .Lblocks_avx
409         testl   %r8d,%r8d
410         jz      .Lblocks
411
412 .Lblocks_avx:
413         andq    $-16,%rdx
414         jz      .Lno_data_avx
415
416         vzeroupper
417
418         testl   %r8d,%r8d
419         jz      .Lbase2_64_avx
420
421         testq   $31,%rdx
422         jz      .Leven_avx
423
424         pushq   %rbx
425 .cfi_adjust_cfa_offset  8
426 .cfi_offset     %rbx,-16
427         pushq   %rbp
428 .cfi_adjust_cfa_offset  8
429 .cfi_offset     %rbp,-24
430         pushq   %r12
431 .cfi_adjust_cfa_offset  8
432 .cfi_offset     %r12,-32
433         pushq   %r13
434 .cfi_adjust_cfa_offset  8
435 .cfi_offset     %r13,-40
436         pushq   %r14
437 .cfi_adjust_cfa_offset  8
438 .cfi_offset     %r14,-48
439         pushq   %r15
440 .cfi_adjust_cfa_offset  8
441 .cfi_offset     %r15,-56
442 .Lblocks_avx_body:
443
444         movq    %rdx,%r15
445
446         movq    0(%rdi),%r8
447         movq    8(%rdi),%r9
448         movl    16(%rdi),%ebp
449
450         movq    24(%rdi),%r11
451         movq    32(%rdi),%r13
452
453
454         movl    %r8d,%r14d
455         andq    $-2147483648,%r8
456         movq    %r9,%r12
457         movl    %r9d,%ebx
458         andq    $-2147483648,%r9
459
460         shrq    $6,%r8
461         shlq    $52,%r12
462         addq    %r8,%r14
463         shrq    $12,%rbx
464         shrq    $18,%r9
465         addq    %r12,%r14
466         adcq    %r9,%rbx
467
468         movq    %rbp,%r8
469         shlq    $40,%r8
470         shrq    $24,%rbp
471         addq    %r8,%rbx
472         adcq    $0,%rbp
473
474         movq    $-4,%r9
475         movq    %rbp,%r8
476         andq    %rbp,%r9
477         shrq    $2,%r8
478         andq    $3,%rbp
479         addq    %r9,%r8
480         addq    %r8,%r14
481         adcq    $0,%rbx
482         adcq    $0,%rbp
483
484         movq    %r13,%r12
485         movq    %r13,%rax
486         shrq    $2,%r13
487         addq    %r12,%r13
488
489         addq    0(%rsi),%r14
490         adcq    8(%rsi),%rbx
491         leaq    16(%rsi),%rsi
492         adcq    %rcx,%rbp
493
494         call    __poly1305_block
495
496         testq   %rcx,%rcx
497         jz      .Lstore_base2_64_avx
498
499
500         movq    %r14,%rax
501         movq    %r14,%rdx
502         shrq    $52,%r14
503         movq    %rbx,%r11
504         movq    %rbx,%r12
505         shrq    $26,%rdx
506         andq    $0x3ffffff,%rax
507         shlq    $12,%r11
508         andq    $0x3ffffff,%rdx
509         shrq    $14,%rbx
510         orq     %r11,%r14
511         shlq    $24,%rbp
512         andq    $0x3ffffff,%r14
513         shrq    $40,%r12
514         andq    $0x3ffffff,%rbx
515         orq     %r12,%rbp
516
517         subq    $16,%r15
518         jz      .Lstore_base2_26_avx
519
520         vmovd   %eax,%xmm0
521         vmovd   %edx,%xmm1
522         vmovd   %r14d,%xmm2
523         vmovd   %ebx,%xmm3
524         vmovd   %ebp,%xmm4
525         jmp     .Lproceed_avx
526
527 .align  32
528 .Lstore_base2_64_avx:
529         movq    %r14,0(%rdi)
530         movq    %rbx,8(%rdi)
531         movq    %rbp,16(%rdi)
532         jmp     .Ldone_avx
533
534 .align  16
535 .Lstore_base2_26_avx:
536         movl    %eax,0(%rdi)
537         movl    %edx,4(%rdi)
538         movl    %r14d,8(%rdi)
539         movl    %ebx,12(%rdi)
540         movl    %ebp,16(%rdi)
541 .align  16
542 .Ldone_avx:
543         movq    0(%rsp),%r15
544 .cfi_restore    %r15
545         movq    8(%rsp),%r14
546 .cfi_restore    %r14
547         movq    16(%rsp),%r13
548 .cfi_restore    %r13
549         movq    24(%rsp),%r12
550 .cfi_restore    %r12
551         movq    32(%rsp),%rbp
552 .cfi_restore    %rbp
553         movq    40(%rsp),%rbx
554 .cfi_restore    %rbx
555         leaq    48(%rsp),%rsp
556 .cfi_adjust_cfa_offset  -48
557 .Lno_data_avx:
558 .Lblocks_avx_epilogue:
559         .byte   0xf3,0xc3
560 .cfi_endproc    
561
562 .align  32
563 .Lbase2_64_avx:
564 .cfi_startproc  
565         pushq   %rbx
566 .cfi_adjust_cfa_offset  8
567 .cfi_offset     %rbx,-16
568         pushq   %rbp
569 .cfi_adjust_cfa_offset  8
570 .cfi_offset     %rbp,-24
571         pushq   %r12
572 .cfi_adjust_cfa_offset  8
573 .cfi_offset     %r12,-32
574         pushq   %r13
575 .cfi_adjust_cfa_offset  8
576 .cfi_offset     %r13,-40
577         pushq   %r14
578 .cfi_adjust_cfa_offset  8
579 .cfi_offset     %r14,-48
580         pushq   %r15
581 .cfi_adjust_cfa_offset  8
582 .cfi_offset     %r15,-56
583 .Lbase2_64_avx_body:
584
585         movq    %rdx,%r15
586
587         movq    24(%rdi),%r11
588         movq    32(%rdi),%r13
589
590         movq    0(%rdi),%r14
591         movq    8(%rdi),%rbx
592         movl    16(%rdi),%ebp
593
594         movq    %r13,%r12
595         movq    %r13,%rax
596         shrq    $2,%r13
597         addq    %r12,%r13
598
599         testq   $31,%rdx
600         jz      .Linit_avx
601
602         addq    0(%rsi),%r14
603         adcq    8(%rsi),%rbx
604         leaq    16(%rsi),%rsi
605         adcq    %rcx,%rbp
606         subq    $16,%r15
607
608         call    __poly1305_block
609
610 .Linit_avx:
611
612         movq    %r14,%rax
613         movq    %r14,%rdx
614         shrq    $52,%r14
615         movq    %rbx,%r8
616         movq    %rbx,%r9
617         shrq    $26,%rdx
618         andq    $0x3ffffff,%rax
619         shlq    $12,%r8
620         andq    $0x3ffffff,%rdx
621         shrq    $14,%rbx
622         orq     %r8,%r14
623         shlq    $24,%rbp
624         andq    $0x3ffffff,%r14
625         shrq    $40,%r9
626         andq    $0x3ffffff,%rbx
627         orq     %r9,%rbp
628
629         vmovd   %eax,%xmm0
630         vmovd   %edx,%xmm1
631         vmovd   %r14d,%xmm2
632         vmovd   %ebx,%xmm3
633         vmovd   %ebp,%xmm4
634         movl    $1,20(%rdi)
635
636         call    __poly1305_init_avx
637
638 .Lproceed_avx:
639         movq    %r15,%rdx
640
641         movq    0(%rsp),%r15
642 .cfi_restore    %r15
643         movq    8(%rsp),%r14
644 .cfi_restore    %r14
645         movq    16(%rsp),%r13
646 .cfi_restore    %r13
647         movq    24(%rsp),%r12
648 .cfi_restore    %r12
649         movq    32(%rsp),%rbp
650 .cfi_restore    %rbp
651         movq    40(%rsp),%rbx
652 .cfi_restore    %rbx
653         leaq    48(%rsp),%rax
654         leaq    48(%rsp),%rsp
655 .cfi_adjust_cfa_offset  -48
656 .Lbase2_64_avx_epilogue:
657         jmp     .Ldo_avx
658 .cfi_endproc    
659
660 .align  32
661 .Leven_avx:
662 .cfi_startproc  
663         vmovd   0(%rdi),%xmm0
664         vmovd   4(%rdi),%xmm1
665         vmovd   8(%rdi),%xmm2
666         vmovd   12(%rdi),%xmm3
667         vmovd   16(%rdi),%xmm4
668
669 .Ldo_avx:
670         leaq    -88(%rsp),%r11
671 .cfi_def_cfa    %r11,0x60
672         subq    $0x178,%rsp
673         subq    $64,%rdx
674         leaq    -32(%rsi),%rax
675         cmovcq  %rax,%rsi
676
677         vmovdqu 48(%rdi),%xmm14
678         leaq    112(%rdi),%rdi
679         leaq    .Lconst(%rip),%rcx
680
681
682
683         vmovdqu 32(%rsi),%xmm5
684         vmovdqu 48(%rsi),%xmm6
685         vmovdqa 64(%rcx),%xmm15
686
687         vpsrldq $6,%xmm5,%xmm7
688         vpsrldq $6,%xmm6,%xmm8
689         vpunpckhqdq     %xmm6,%xmm5,%xmm9
690         vpunpcklqdq     %xmm6,%xmm5,%xmm5
691         vpunpcklqdq     %xmm8,%xmm7,%xmm8
692
693         vpsrlq  $40,%xmm9,%xmm9
694         vpsrlq  $26,%xmm5,%xmm6
695         vpand   %xmm15,%xmm5,%xmm5
696         vpsrlq  $4,%xmm8,%xmm7
697         vpand   %xmm15,%xmm6,%xmm6
698         vpsrlq  $30,%xmm8,%xmm8
699         vpand   %xmm15,%xmm7,%xmm7
700         vpand   %xmm15,%xmm8,%xmm8
701         vpor    32(%rcx),%xmm9,%xmm9
702
703         jbe     .Lskip_loop_avx
704
705
706         vmovdqu -48(%rdi),%xmm11
707         vmovdqu -32(%rdi),%xmm12
708         vpshufd $0xEE,%xmm14,%xmm13
709         vpshufd $0x44,%xmm14,%xmm10
710         vmovdqa %xmm13,-144(%r11)
711         vmovdqa %xmm10,0(%rsp)
712         vpshufd $0xEE,%xmm11,%xmm14
713         vmovdqu -16(%rdi),%xmm10
714         vpshufd $0x44,%xmm11,%xmm11
715         vmovdqa %xmm14,-128(%r11)
716         vmovdqa %xmm11,16(%rsp)
717         vpshufd $0xEE,%xmm12,%xmm13
718         vmovdqu 0(%rdi),%xmm11
719         vpshufd $0x44,%xmm12,%xmm12
720         vmovdqa %xmm13,-112(%r11)
721         vmovdqa %xmm12,32(%rsp)
722         vpshufd $0xEE,%xmm10,%xmm14
723         vmovdqu 16(%rdi),%xmm12
724         vpshufd $0x44,%xmm10,%xmm10
725         vmovdqa %xmm14,-96(%r11)
726         vmovdqa %xmm10,48(%rsp)
727         vpshufd $0xEE,%xmm11,%xmm13
728         vmovdqu 32(%rdi),%xmm10
729         vpshufd $0x44,%xmm11,%xmm11
730         vmovdqa %xmm13,-80(%r11)
731         vmovdqa %xmm11,64(%rsp)
732         vpshufd $0xEE,%xmm12,%xmm14
733         vmovdqu 48(%rdi),%xmm11
734         vpshufd $0x44,%xmm12,%xmm12
735         vmovdqa %xmm14,-64(%r11)
736         vmovdqa %xmm12,80(%rsp)
737         vpshufd $0xEE,%xmm10,%xmm13
738         vmovdqu 64(%rdi),%xmm12
739         vpshufd $0x44,%xmm10,%xmm10
740         vmovdqa %xmm13,-48(%r11)
741         vmovdqa %xmm10,96(%rsp)
742         vpshufd $0xEE,%xmm11,%xmm14
743         vpshufd $0x44,%xmm11,%xmm11
744         vmovdqa %xmm14,-32(%r11)
745         vmovdqa %xmm11,112(%rsp)
746         vpshufd $0xEE,%xmm12,%xmm13
747         vmovdqa 0(%rsp),%xmm14
748         vpshufd $0x44,%xmm12,%xmm12
749         vmovdqa %xmm13,-16(%r11)
750         vmovdqa %xmm12,128(%rsp)
751
752         jmp     .Loop_avx
753
754 .align  32
755 .Loop_avx:
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776         vpmuludq        %xmm5,%xmm14,%xmm10
777         vpmuludq        %xmm6,%xmm14,%xmm11
778         vmovdqa %xmm2,32(%r11)
779         vpmuludq        %xmm7,%xmm14,%xmm12
780         vmovdqa 16(%rsp),%xmm2
781         vpmuludq        %xmm8,%xmm14,%xmm13
782         vpmuludq        %xmm9,%xmm14,%xmm14
783
784         vmovdqa %xmm0,0(%r11)
785         vpmuludq        32(%rsp),%xmm9,%xmm0
786         vmovdqa %xmm1,16(%r11)
787         vpmuludq        %xmm8,%xmm2,%xmm1
788         vpaddq  %xmm0,%xmm10,%xmm10
789         vpaddq  %xmm1,%xmm14,%xmm14
790         vmovdqa %xmm3,48(%r11)
791         vpmuludq        %xmm7,%xmm2,%xmm0
792         vpmuludq        %xmm6,%xmm2,%xmm1
793         vpaddq  %xmm0,%xmm13,%xmm13
794         vmovdqa 48(%rsp),%xmm3
795         vpaddq  %xmm1,%xmm12,%xmm12
796         vmovdqa %xmm4,64(%r11)
797         vpmuludq        %xmm5,%xmm2,%xmm2
798         vpmuludq        %xmm7,%xmm3,%xmm0
799         vpaddq  %xmm2,%xmm11,%xmm11
800
801         vmovdqa 64(%rsp),%xmm4
802         vpaddq  %xmm0,%xmm14,%xmm14
803         vpmuludq        %xmm6,%xmm3,%xmm1
804         vpmuludq        %xmm5,%xmm3,%xmm3
805         vpaddq  %xmm1,%xmm13,%xmm13
806         vmovdqa 80(%rsp),%xmm2
807         vpaddq  %xmm3,%xmm12,%xmm12
808         vpmuludq        %xmm9,%xmm4,%xmm0
809         vpmuludq        %xmm8,%xmm4,%xmm4
810         vpaddq  %xmm0,%xmm11,%xmm11
811         vmovdqa 96(%rsp),%xmm3
812         vpaddq  %xmm4,%xmm10,%xmm10
813
814         vmovdqa 128(%rsp),%xmm4
815         vpmuludq        %xmm6,%xmm2,%xmm1
816         vpmuludq        %xmm5,%xmm2,%xmm2
817         vpaddq  %xmm1,%xmm14,%xmm14
818         vpaddq  %xmm2,%xmm13,%xmm13
819         vpmuludq        %xmm9,%xmm3,%xmm0
820         vpmuludq        %xmm8,%xmm3,%xmm1
821         vpaddq  %xmm0,%xmm12,%xmm12
822         vmovdqu 0(%rsi),%xmm0
823         vpaddq  %xmm1,%xmm11,%xmm11
824         vpmuludq        %xmm7,%xmm3,%xmm3
825         vpmuludq        %xmm7,%xmm4,%xmm7
826         vpaddq  %xmm3,%xmm10,%xmm10
827
828         vmovdqu 16(%rsi),%xmm1
829         vpaddq  %xmm7,%xmm11,%xmm11
830         vpmuludq        %xmm8,%xmm4,%xmm8
831         vpmuludq        %xmm9,%xmm4,%xmm9
832         vpsrldq $6,%xmm0,%xmm2
833         vpaddq  %xmm8,%xmm12,%xmm12
834         vpaddq  %xmm9,%xmm13,%xmm13
835         vpsrldq $6,%xmm1,%xmm3
836         vpmuludq        112(%rsp),%xmm5,%xmm9
837         vpmuludq        %xmm6,%xmm4,%xmm5
838         vpunpckhqdq     %xmm1,%xmm0,%xmm4
839         vpaddq  %xmm9,%xmm14,%xmm14
840         vmovdqa -144(%r11),%xmm9
841         vpaddq  %xmm5,%xmm10,%xmm10
842
843         vpunpcklqdq     %xmm1,%xmm0,%xmm0
844         vpunpcklqdq     %xmm3,%xmm2,%xmm3
845
846
847         vpsrldq $5,%xmm4,%xmm4
848         vpsrlq  $26,%xmm0,%xmm1
849         vpand   %xmm15,%xmm0,%xmm0
850         vpsrlq  $4,%xmm3,%xmm2
851         vpand   %xmm15,%xmm1,%xmm1
852         vpand   0(%rcx),%xmm4,%xmm4
853         vpsrlq  $30,%xmm3,%xmm3
854         vpand   %xmm15,%xmm2,%xmm2
855         vpand   %xmm15,%xmm3,%xmm3
856         vpor    32(%rcx),%xmm4,%xmm4
857
858         vpaddq  0(%r11),%xmm0,%xmm0
859         vpaddq  16(%r11),%xmm1,%xmm1
860         vpaddq  32(%r11),%xmm2,%xmm2
861         vpaddq  48(%r11),%xmm3,%xmm3
862         vpaddq  64(%r11),%xmm4,%xmm4
863
864         leaq    32(%rsi),%rax
865         leaq    64(%rsi),%rsi
866         subq    $64,%rdx
867         cmovcq  %rax,%rsi
868
869
870
871
872
873
874
875
876
877
878         vpmuludq        %xmm0,%xmm9,%xmm5
879         vpmuludq        %xmm1,%xmm9,%xmm6
880         vpaddq  %xmm5,%xmm10,%xmm10
881         vpaddq  %xmm6,%xmm11,%xmm11
882         vmovdqa -128(%r11),%xmm7
883         vpmuludq        %xmm2,%xmm9,%xmm5
884         vpmuludq        %xmm3,%xmm9,%xmm6
885         vpaddq  %xmm5,%xmm12,%xmm12
886         vpaddq  %xmm6,%xmm13,%xmm13
887         vpmuludq        %xmm4,%xmm9,%xmm9
888         vpmuludq        -112(%r11),%xmm4,%xmm5
889         vpaddq  %xmm9,%xmm14,%xmm14
890
891         vpaddq  %xmm5,%xmm10,%xmm10
892         vpmuludq        %xmm2,%xmm7,%xmm6
893         vpmuludq        %xmm3,%xmm7,%xmm5
894         vpaddq  %xmm6,%xmm13,%xmm13
895         vmovdqa -96(%r11),%xmm8
896         vpaddq  %xmm5,%xmm14,%xmm14
897         vpmuludq        %xmm1,%xmm7,%xmm6
898         vpmuludq        %xmm0,%xmm7,%xmm7
899         vpaddq  %xmm6,%xmm12,%xmm12
900         vpaddq  %xmm7,%xmm11,%xmm11
901
902         vmovdqa -80(%r11),%xmm9
903         vpmuludq        %xmm2,%xmm8,%xmm5
904         vpmuludq        %xmm1,%xmm8,%xmm6
905         vpaddq  %xmm5,%xmm14,%xmm14
906         vpaddq  %xmm6,%xmm13,%xmm13
907         vmovdqa -64(%r11),%xmm7
908         vpmuludq        %xmm0,%xmm8,%xmm8
909         vpmuludq        %xmm4,%xmm9,%xmm5
910         vpaddq  %xmm8,%xmm12,%xmm12
911         vpaddq  %xmm5,%xmm11,%xmm11
912         vmovdqa -48(%r11),%xmm8
913         vpmuludq        %xmm3,%xmm9,%xmm9
914         vpmuludq        %xmm1,%xmm7,%xmm6
915         vpaddq  %xmm9,%xmm10,%xmm10
916
917         vmovdqa -16(%r11),%xmm9
918         vpaddq  %xmm6,%xmm14,%xmm14
919         vpmuludq        %xmm0,%xmm7,%xmm7
920         vpmuludq        %xmm4,%xmm8,%xmm5
921         vpaddq  %xmm7,%xmm13,%xmm13
922         vpaddq  %xmm5,%xmm12,%xmm12
923         vmovdqu 32(%rsi),%xmm5
924         vpmuludq        %xmm3,%xmm8,%xmm7
925         vpmuludq        %xmm2,%xmm8,%xmm8
926         vpaddq  %xmm7,%xmm11,%xmm11
927         vmovdqu 48(%rsi),%xmm6
928         vpaddq  %xmm8,%xmm10,%xmm10
929
930         vpmuludq        %xmm2,%xmm9,%xmm2
931         vpmuludq        %xmm3,%xmm9,%xmm3
932         vpsrldq $6,%xmm5,%xmm7
933         vpaddq  %xmm2,%xmm11,%xmm11
934         vpmuludq        %xmm4,%xmm9,%xmm4
935         vpsrldq $6,%xmm6,%xmm8
936         vpaddq  %xmm3,%xmm12,%xmm2
937         vpaddq  %xmm4,%xmm13,%xmm3
938         vpmuludq        -32(%r11),%xmm0,%xmm4
939         vpmuludq        %xmm1,%xmm9,%xmm0
940         vpunpckhqdq     %xmm6,%xmm5,%xmm9
941         vpaddq  %xmm4,%xmm14,%xmm4
942         vpaddq  %xmm0,%xmm10,%xmm0
943
944         vpunpcklqdq     %xmm6,%xmm5,%xmm5
945         vpunpcklqdq     %xmm8,%xmm7,%xmm8
946
947
948         vpsrldq $5,%xmm9,%xmm9
949         vpsrlq  $26,%xmm5,%xmm6
950         vmovdqa 0(%rsp),%xmm14
951         vpand   %xmm15,%xmm5,%xmm5
952         vpsrlq  $4,%xmm8,%xmm7
953         vpand   %xmm15,%xmm6,%xmm6
954         vpand   0(%rcx),%xmm9,%xmm9
955         vpsrlq  $30,%xmm8,%xmm8
956         vpand   %xmm15,%xmm7,%xmm7
957         vpand   %xmm15,%xmm8,%xmm8
958         vpor    32(%rcx),%xmm9,%xmm9
959
960
961
962
963
964         vpsrlq  $26,%xmm3,%xmm13
965         vpand   %xmm15,%xmm3,%xmm3
966         vpaddq  %xmm13,%xmm4,%xmm4
967
968         vpsrlq  $26,%xmm0,%xmm10
969         vpand   %xmm15,%xmm0,%xmm0
970         vpaddq  %xmm10,%xmm11,%xmm1
971
972         vpsrlq  $26,%xmm4,%xmm10
973         vpand   %xmm15,%xmm4,%xmm4
974
975         vpsrlq  $26,%xmm1,%xmm11
976         vpand   %xmm15,%xmm1,%xmm1
977         vpaddq  %xmm11,%xmm2,%xmm2
978
979         vpaddq  %xmm10,%xmm0,%xmm0
980         vpsllq  $2,%xmm10,%xmm10
981         vpaddq  %xmm10,%xmm0,%xmm0
982
983         vpsrlq  $26,%xmm2,%xmm12
984         vpand   %xmm15,%xmm2,%xmm2
985         vpaddq  %xmm12,%xmm3,%xmm3
986
987         vpsrlq  $26,%xmm0,%xmm10
988         vpand   %xmm15,%xmm0,%xmm0
989         vpaddq  %xmm10,%xmm1,%xmm1
990
991         vpsrlq  $26,%xmm3,%xmm13
992         vpand   %xmm15,%xmm3,%xmm3
993         vpaddq  %xmm13,%xmm4,%xmm4
994
995         ja      .Loop_avx
996
997 .Lskip_loop_avx:
998
999
1000
1001         vpshufd $0x10,%xmm14,%xmm14
1002         addq    $32,%rdx
1003         jnz     .Long_tail_avx
1004
1005         vpaddq  %xmm2,%xmm7,%xmm7
1006         vpaddq  %xmm0,%xmm5,%xmm5
1007         vpaddq  %xmm1,%xmm6,%xmm6
1008         vpaddq  %xmm3,%xmm8,%xmm8
1009         vpaddq  %xmm4,%xmm9,%xmm9
1010
1011 .Long_tail_avx:
1012         vmovdqa %xmm2,32(%r11)
1013         vmovdqa %xmm0,0(%r11)
1014         vmovdqa %xmm1,16(%r11)
1015         vmovdqa %xmm3,48(%r11)
1016         vmovdqa %xmm4,64(%r11)
1017
1018
1019
1020
1021
1022
1023
1024         vpmuludq        %xmm7,%xmm14,%xmm12
1025         vpmuludq        %xmm5,%xmm14,%xmm10
1026         vpshufd $0x10,-48(%rdi),%xmm2
1027         vpmuludq        %xmm6,%xmm14,%xmm11
1028         vpmuludq        %xmm8,%xmm14,%xmm13
1029         vpmuludq        %xmm9,%xmm14,%xmm14
1030
1031         vpmuludq        %xmm8,%xmm2,%xmm0
1032         vpaddq  %xmm0,%xmm14,%xmm14
1033         vpshufd $0x10,-32(%rdi),%xmm3
1034         vpmuludq        %xmm7,%xmm2,%xmm1
1035         vpaddq  %xmm1,%xmm13,%xmm13
1036         vpshufd $0x10,-16(%rdi),%xmm4
1037         vpmuludq        %xmm6,%xmm2,%xmm0
1038         vpaddq  %xmm0,%xmm12,%xmm12
1039         vpmuludq        %xmm5,%xmm2,%xmm2
1040         vpaddq  %xmm2,%xmm11,%xmm11
1041         vpmuludq        %xmm9,%xmm3,%xmm3
1042         vpaddq  %xmm3,%xmm10,%xmm10
1043
1044         vpshufd $0x10,0(%rdi),%xmm2
1045         vpmuludq        %xmm7,%xmm4,%xmm1
1046         vpaddq  %xmm1,%xmm14,%xmm14
1047         vpmuludq        %xmm6,%xmm4,%xmm0
1048         vpaddq  %xmm0,%xmm13,%xmm13
1049         vpshufd $0x10,16(%rdi),%xmm3
1050         vpmuludq        %xmm5,%xmm4,%xmm4
1051         vpaddq  %xmm4,%xmm12,%xmm12
1052         vpmuludq        %xmm9,%xmm2,%xmm1
1053         vpaddq  %xmm1,%xmm11,%xmm11
1054         vpshufd $0x10,32(%rdi),%xmm4
1055         vpmuludq        %xmm8,%xmm2,%xmm2
1056         vpaddq  %xmm2,%xmm10,%xmm10
1057
1058         vpmuludq        %xmm6,%xmm3,%xmm0
1059         vpaddq  %xmm0,%xmm14,%xmm14
1060         vpmuludq        %xmm5,%xmm3,%xmm3
1061         vpaddq  %xmm3,%xmm13,%xmm13
1062         vpshufd $0x10,48(%rdi),%xmm2
1063         vpmuludq        %xmm9,%xmm4,%xmm1
1064         vpaddq  %xmm1,%xmm12,%xmm12
1065         vpshufd $0x10,64(%rdi),%xmm3
1066         vpmuludq        %xmm8,%xmm4,%xmm0
1067         vpaddq  %xmm0,%xmm11,%xmm11
1068         vpmuludq        %xmm7,%xmm4,%xmm4
1069         vpaddq  %xmm4,%xmm10,%xmm10
1070
1071         vpmuludq        %xmm5,%xmm2,%xmm2
1072         vpaddq  %xmm2,%xmm14,%xmm14
1073         vpmuludq        %xmm9,%xmm3,%xmm1
1074         vpaddq  %xmm1,%xmm13,%xmm13
1075         vpmuludq        %xmm8,%xmm3,%xmm0
1076         vpaddq  %xmm0,%xmm12,%xmm12
1077         vpmuludq        %xmm7,%xmm3,%xmm1
1078         vpaddq  %xmm1,%xmm11,%xmm11
1079         vpmuludq        %xmm6,%xmm3,%xmm3
1080         vpaddq  %xmm3,%xmm10,%xmm10
1081
1082         jz      .Lshort_tail_avx
1083
1084         vmovdqu 0(%rsi),%xmm0
1085         vmovdqu 16(%rsi),%xmm1
1086
1087         vpsrldq $6,%xmm0,%xmm2
1088         vpsrldq $6,%xmm1,%xmm3
1089         vpunpckhqdq     %xmm1,%xmm0,%xmm4
1090         vpunpcklqdq     %xmm1,%xmm0,%xmm0
1091         vpunpcklqdq     %xmm3,%xmm2,%xmm3
1092
1093         vpsrlq  $40,%xmm4,%xmm4
1094         vpsrlq  $26,%xmm0,%xmm1
1095         vpand   %xmm15,%xmm0,%xmm0
1096         vpsrlq  $4,%xmm3,%xmm2
1097         vpand   %xmm15,%xmm1,%xmm1
1098         vpsrlq  $30,%xmm3,%xmm3
1099         vpand   %xmm15,%xmm2,%xmm2
1100         vpand   %xmm15,%xmm3,%xmm3
1101         vpor    32(%rcx),%xmm4,%xmm4
1102
1103         vpshufd $0x32,-64(%rdi),%xmm9
1104         vpaddq  0(%r11),%xmm0,%xmm0
1105         vpaddq  16(%r11),%xmm1,%xmm1
1106         vpaddq  32(%r11),%xmm2,%xmm2
1107         vpaddq  48(%r11),%xmm3,%xmm3
1108         vpaddq  64(%r11),%xmm4,%xmm4
1109
1110
1111
1112
1113         vpmuludq        %xmm0,%xmm9,%xmm5
1114         vpaddq  %xmm5,%xmm10,%xmm10
1115         vpmuludq        %xmm1,%xmm9,%xmm6
1116         vpaddq  %xmm6,%xmm11,%xmm11
1117         vpmuludq        %xmm2,%xmm9,%xmm5
1118         vpaddq  %xmm5,%xmm12,%xmm12
1119         vpshufd $0x32,-48(%rdi),%xmm7
1120         vpmuludq        %xmm3,%xmm9,%xmm6
1121         vpaddq  %xmm6,%xmm13,%xmm13
1122         vpmuludq        %xmm4,%xmm9,%xmm9
1123         vpaddq  %xmm9,%xmm14,%xmm14
1124
1125         vpmuludq        %xmm3,%xmm7,%xmm5
1126         vpaddq  %xmm5,%xmm14,%xmm14
1127         vpshufd $0x32,-32(%rdi),%xmm8
1128         vpmuludq        %xmm2,%xmm7,%xmm6
1129         vpaddq  %xmm6,%xmm13,%xmm13
1130         vpshufd $0x32,-16(%rdi),%xmm9
1131         vpmuludq        %xmm1,%xmm7,%xmm5
1132         vpaddq  %xmm5,%xmm12,%xmm12
1133         vpmuludq        %xmm0,%xmm7,%xmm7
1134         vpaddq  %xmm7,%xmm11,%xmm11
1135         vpmuludq        %xmm4,%xmm8,%xmm8
1136         vpaddq  %xmm8,%xmm10,%xmm10
1137
1138         vpshufd $0x32,0(%rdi),%xmm7
1139         vpmuludq        %xmm2,%xmm9,%xmm6
1140         vpaddq  %xmm6,%xmm14,%xmm14
1141         vpmuludq        %xmm1,%xmm9,%xmm5
1142         vpaddq  %xmm5,%xmm13,%xmm13
1143         vpshufd $0x32,16(%rdi),%xmm8
1144         vpmuludq        %xmm0,%xmm9,%xmm9
1145         vpaddq  %xmm9,%xmm12,%xmm12
1146         vpmuludq        %xmm4,%xmm7,%xmm6
1147         vpaddq  %xmm6,%xmm11,%xmm11
1148         vpshufd $0x32,32(%rdi),%xmm9
1149         vpmuludq        %xmm3,%xmm7,%xmm7
1150         vpaddq  %xmm7,%xmm10,%xmm10
1151
1152         vpmuludq        %xmm1,%xmm8,%xmm5
1153         vpaddq  %xmm5,%xmm14,%xmm14
1154         vpmuludq        %xmm0,%xmm8,%xmm8
1155         vpaddq  %xmm8,%xmm13,%xmm13
1156         vpshufd $0x32,48(%rdi),%xmm7
1157         vpmuludq        %xmm4,%xmm9,%xmm6
1158         vpaddq  %xmm6,%xmm12,%xmm12
1159         vpshufd $0x32,64(%rdi),%xmm8
1160         vpmuludq        %xmm3,%xmm9,%xmm5
1161         vpaddq  %xmm5,%xmm11,%xmm11
1162         vpmuludq        %xmm2,%xmm9,%xmm9
1163         vpaddq  %xmm9,%xmm10,%xmm10
1164
1165         vpmuludq        %xmm0,%xmm7,%xmm7
1166         vpaddq  %xmm7,%xmm14,%xmm14
1167         vpmuludq        %xmm4,%xmm8,%xmm6
1168         vpaddq  %xmm6,%xmm13,%xmm13
1169         vpmuludq        %xmm3,%xmm8,%xmm5
1170         vpaddq  %xmm5,%xmm12,%xmm12
1171         vpmuludq        %xmm2,%xmm8,%xmm6
1172         vpaddq  %xmm6,%xmm11,%xmm11
1173         vpmuludq        %xmm1,%xmm8,%xmm8
1174         vpaddq  %xmm8,%xmm10,%xmm10
1175
1176 .Lshort_tail_avx:
1177
1178
1179
1180         vpsrldq $8,%xmm14,%xmm9
1181         vpsrldq $8,%xmm13,%xmm8
1182         vpsrldq $8,%xmm11,%xmm6
1183         vpsrldq $8,%xmm10,%xmm5
1184         vpsrldq $8,%xmm12,%xmm7
1185         vpaddq  %xmm8,%xmm13,%xmm13
1186         vpaddq  %xmm9,%xmm14,%xmm14
1187         vpaddq  %xmm5,%xmm10,%xmm10
1188         vpaddq  %xmm6,%xmm11,%xmm11
1189         vpaddq  %xmm7,%xmm12,%xmm12
1190
1191
1192
1193
1194         vpsrlq  $26,%xmm13,%xmm3
1195         vpand   %xmm15,%xmm13,%xmm13
1196         vpaddq  %xmm3,%xmm14,%xmm14
1197
1198         vpsrlq  $26,%xmm10,%xmm0
1199         vpand   %xmm15,%xmm10,%xmm10
1200         vpaddq  %xmm0,%xmm11,%xmm11
1201
1202         vpsrlq  $26,%xmm14,%xmm4
1203         vpand   %xmm15,%xmm14,%xmm14
1204
1205         vpsrlq  $26,%xmm11,%xmm1
1206         vpand   %xmm15,%xmm11,%xmm11
1207         vpaddq  %xmm1,%xmm12,%xmm12
1208
1209         vpaddq  %xmm4,%xmm10,%xmm10
1210         vpsllq  $2,%xmm4,%xmm4
1211         vpaddq  %xmm4,%xmm10,%xmm10
1212
1213         vpsrlq  $26,%xmm12,%xmm2
1214         vpand   %xmm15,%xmm12,%xmm12
1215         vpaddq  %xmm2,%xmm13,%xmm13
1216
1217         vpsrlq  $26,%xmm10,%xmm0
1218         vpand   %xmm15,%xmm10,%xmm10
1219         vpaddq  %xmm0,%xmm11,%xmm11
1220
1221         vpsrlq  $26,%xmm13,%xmm3
1222         vpand   %xmm15,%xmm13,%xmm13
1223         vpaddq  %xmm3,%xmm14,%xmm14
1224
1225         vmovd   %xmm10,-112(%rdi)
1226         vmovd   %xmm11,-108(%rdi)
1227         vmovd   %xmm12,-104(%rdi)
1228         vmovd   %xmm13,-100(%rdi)
1229         vmovd   %xmm14,-96(%rdi)
1230         leaq    88(%r11),%rsp
1231 .cfi_def_cfa    %rsp,8
1232         vzeroupper
1233         .byte   0xf3,0xc3
1234 .cfi_endproc    
1235 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
1236
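/*
 * poly1305_emit_avx: as poly1305_emit, but if the vector code left the
 * accumulator in base 2^26 (ctx+20 != 0) it is first converted back to
 * base 2^64 before the final reduction, nonce addition and tag store.
 */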
1237 .type   poly1305_emit_avx,@function
1238 .align  32
1239 poly1305_emit_avx:
1240 .cfi_startproc  
1241         cmpl    $0,20(%rdi)
1242         je      .Lemit
1243
1244         movl    0(%rdi),%eax
1245         movl    4(%rdi),%ecx
1246         movl    8(%rdi),%r8d
1247         movl    12(%rdi),%r11d
1248         movl    16(%rdi),%r10d
1249
1250         shlq    $26,%rcx
1251         movq    %r8,%r9
1252         shlq    $52,%r8
1253         addq    %rcx,%rax
1254         shrq    $12,%r9
1255         addq    %rax,%r8
1256         adcq    $0,%r9
1257
1258         shlq    $14,%r11
1259         movq    %r10,%rax
1260         shrq    $24,%r10
1261         addq    %r11,%r9
1262         shlq    $40,%rax
1263         addq    %rax,%r9
1264         adcq    $0,%r10
1265
1266         movq    %r10,%rax
1267         movq    %r10,%rcx
1268         andq    $3,%r10
1269         shrq    $2,%rax
1270         andq    $-4,%rcx
1271         addq    %rcx,%rax
1272         addq    %rax,%r8
1273         adcq    $0,%r9
1274         adcq    $0,%r10
1275
1276         movq    %r8,%rax
1277         addq    $5,%r8
1278         movq    %r9,%rcx
1279         adcq    $0,%r9
1280         adcq    $0,%r10
1281         shrq    $2,%r10
1282         cmovnzq %r8,%rax
1283         cmovnzq %r9,%rcx
1284
1285         addq    0(%rdx),%rax
1286         adcq    8(%rdx),%rcx
1287         movq    %rax,0(%rsi)
1288         movq    %rcx,8(%rsi)
1289
1290         .byte   0xf3,0xc3
1291 .cfi_endproc    
1292 .size   poly1305_emit_avx,.-poly1305_emit_avx
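/*
 * poly1305_blocks_avx2: 256-bit AVX2 path, four blocks per iteration,
 * using the same base 2^26 representation and key power table as the
 * AVX path, and the same scalar fallback for short inputs.
 */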
1293 .type   poly1305_blocks_avx2,@function
1294 .align  32
1295 poly1305_blocks_avx2:
1296 .cfi_startproc  
1297         movl    20(%rdi),%r8d
1298         cmpq    $128,%rdx
1299         jae     .Lblocks_avx2
1300         testl   %r8d,%r8d
1301         jz      .Lblocks
1302
1303 .Lblocks_avx2:
1304         andq    $-16,%rdx
1305         jz      .Lno_data_avx2
1306
1307         vzeroupper
1308
1309         testl   %r8d,%r8d
1310         jz      .Lbase2_64_avx2
1311
1312         testq   $63,%rdx
1313         jz      .Leven_avx2
1314
1315         pushq   %rbx
1316 .cfi_adjust_cfa_offset  8
1317 .cfi_offset     %rbx,-16
1318         pushq   %rbp
1319 .cfi_adjust_cfa_offset  8
1320 .cfi_offset     %rbp,-24
1321         pushq   %r12
1322 .cfi_adjust_cfa_offset  8
1323 .cfi_offset     %r12,-32
1324         pushq   %r13
1325 .cfi_adjust_cfa_offset  8
1326 .cfi_offset     %r13,-40
1327         pushq   %r14
1328 .cfi_adjust_cfa_offset  8
1329 .cfi_offset     %r14,-48
1330         pushq   %r15
1331 .cfi_adjust_cfa_offset  8
1332 .cfi_offset     %r15,-56
1333 .Lblocks_avx2_body:
1334
1335         movq    %rdx,%r15
1336
1337         movq    0(%rdi),%r8
1338         movq    8(%rdi),%r9
1339         movl    16(%rdi),%ebp
1340
1341         movq    24(%rdi),%r11
1342         movq    32(%rdi),%r13
1343
1344
1345         movl    %r8d,%r14d
1346         andq    $-2147483648,%r8
1347         movq    %r9,%r12
1348         movl    %r9d,%ebx
1349         andq    $-2147483648,%r9
1350
1351         shrq    $6,%r8
1352         shlq    $52,%r12
1353         addq    %r8,%r14
1354         shrq    $12,%rbx
1355         shrq    $18,%r9
1356         addq    %r12,%r14
1357         adcq    %r9,%rbx
1358
1359         movq    %rbp,%r8
1360         shlq    $40,%r8
1361         shrq    $24,%rbp
1362         addq    %r8,%rbx
1363         adcq    $0,%rbp
1364
1365         movq    $-4,%r9
1366         movq    %rbp,%r8
1367         andq    %rbp,%r9
1368         shrq    $2,%r8
1369         andq    $3,%rbp
1370         addq    %r9,%r8
1371         addq    %r8,%r14
1372         adcq    $0,%rbx
1373         adcq    $0,%rbp
1374
1375         movq    %r13,%r12
1376         movq    %r13,%rax
1377         shrq    $2,%r13
1378         addq    %r12,%r13
1379
1380 .Lbase2_26_pre_avx2:
1381         addq    0(%rsi),%r14
1382         adcq    8(%rsi),%rbx
1383         leaq    16(%rsi),%rsi
1384         adcq    %rcx,%rbp
1385         subq    $16,%r15
1386
1387         call    __poly1305_block
1388         movq    %r12,%rax
1389
1390         testq   $63,%r15
1391         jnz     .Lbase2_26_pre_avx2
1392
1393         testq   %rcx,%rcx
1394         jz      .Lstore_base2_64_avx2
1395
1396
1397         movq    %r14,%rax
1398         movq    %r14,%rdx
1399         shrq    $52,%r14
1400         movq    %rbx,%r11
1401         movq    %rbx,%r12
1402         shrq    $26,%rdx
1403         andq    $0x3ffffff,%rax
1404         shlq    $12,%r11
1405         andq    $0x3ffffff,%rdx
1406         shrq    $14,%rbx
1407         orq     %r11,%r14
1408         shlq    $24,%rbp
1409         andq    $0x3ffffff,%r14
1410         shrq    $40,%r12
1411         andq    $0x3ffffff,%rbx
1412         orq     %r12,%rbp
1413
1414         testq   %r15,%r15
1415         jz      .Lstore_base2_26_avx2
1416
1417         vmovd   %eax,%xmm0
1418         vmovd   %edx,%xmm1
1419         vmovd   %r14d,%xmm2
1420         vmovd   %ebx,%xmm3
1421         vmovd   %ebp,%xmm4
1422         jmp     .Lproceed_avx2
1423
1424 .align  32
1425 .Lstore_base2_64_avx2:
1426         movq    %r14,0(%rdi)
1427         movq    %rbx,8(%rdi)
1428         movq    %rbp,16(%rdi)
1429         jmp     .Ldone_avx2
1430
1431 .align  16
1432 .Lstore_base2_26_avx2:
1433         movl    %eax,0(%rdi)
1434         movl    %edx,4(%rdi)
1435         movl    %r14d,8(%rdi)
1436         movl    %ebx,12(%rdi)
1437         movl    %ebp,16(%rdi)
1438 .align  16
1439 .Ldone_avx2:
1440         movq    0(%rsp),%r15
1441 .cfi_restore    %r15
1442         movq    8(%rsp),%r14
1443 .cfi_restore    %r14
1444         movq    16(%rsp),%r13
1445 .cfi_restore    %r13
1446         movq    24(%rsp),%r12
1447 .cfi_restore    %r12
1448         movq    32(%rsp),%rbp
1449 .cfi_restore    %rbp
1450         movq    40(%rsp),%rbx
1451 .cfi_restore    %rbx
1452         leaq    48(%rsp),%rsp
1453 .cfi_adjust_cfa_offset  -48
1454 .Lno_data_avx2:
1455 .Lblocks_avx2_epilogue:
1456         .byte   0xf3,0xc3
1457 .cfi_endproc    
1458
1459 .align  32
1460 .Lbase2_64_avx2:
1461 .cfi_startproc  
1462         pushq   %rbx
1463 .cfi_adjust_cfa_offset  8
1464 .cfi_offset     %rbx,-16
1465         pushq   %rbp
1466 .cfi_adjust_cfa_offset  8
1467 .cfi_offset     %rbp,-24
1468         pushq   %r12
1469 .cfi_adjust_cfa_offset  8
1470 .cfi_offset     %r12,-32
1471         pushq   %r13
1472 .cfi_adjust_cfa_offset  8
1473 .cfi_offset     %r13,-40
1474         pushq   %r14
1475 .cfi_adjust_cfa_offset  8
1476 .cfi_offset     %r14,-48
1477         pushq   %r15
1478 .cfi_adjust_cfa_offset  8
1479 .cfi_offset     %r15,-56
1480 .Lbase2_64_avx2_body:
1481
1482         movq    %rdx,%r15
1483
1484         movq    24(%rdi),%r11
1485         movq    32(%rdi),%r13
1486
1487         movq    0(%rdi),%r14
1488         movq    8(%rdi),%rbx
1489         movl    16(%rdi),%ebp
1490
1491         movq    %r13,%r12
1492         movq    %r13,%rax
1493         shrq    $2,%r13
1494         addq    %r12,%r13
1495
1496         testq   $63,%rdx
1497         jz      .Linit_avx2
1498
1499 .Lbase2_64_pre_avx2:
1500         addq    0(%rsi),%r14
1501         adcq    8(%rsi),%rbx
1502         leaq    16(%rsi),%rsi
1503         adcq    %rcx,%rbp
1504         subq    $16,%r15
1505
1506         call    __poly1305_block
1507         movq    %r12,%rax
1508
1509         testq   $63,%r15
1510         jnz     .Lbase2_64_pre_avx2
1511
1512 .Linit_avx2:
1513
1514         movq    %r14,%rax
1515         movq    %r14,%rdx
1516         shrq    $52,%r14
1517         movq    %rbx,%r8
1518         movq    %rbx,%r9
1519         shrq    $26,%rdx
1520         andq    $0x3ffffff,%rax
1521         shlq    $12,%r8
1522         andq    $0x3ffffff,%rdx
1523         shrq    $14,%rbx
1524         orq     %r8,%r14
1525         shlq    $24,%rbp
1526         andq    $0x3ffffff,%r14
1527         shrq    $40,%r9
1528         andq    $0x3ffffff,%rbx
1529         orq     %r9,%rbp
1530
1531         vmovd   %eax,%xmm0
1532         vmovd   %edx,%xmm1
1533         vmovd   %r14d,%xmm2
1534         vmovd   %ebx,%xmm3
1535         vmovd   %ebp,%xmm4
1536         movl    $1,20(%rdi)
1537
1538         call    __poly1305_init_avx
1539
1540 .Lproceed_avx2:
1541         movq    %r15,%rdx
1542         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
1543         movl    $3221291008,%r11d
1544
1545         movq    0(%rsp),%r15
1546 .cfi_restore    %r15
1547         movq    8(%rsp),%r14
1548 .cfi_restore    %r14
1549         movq    16(%rsp),%r13
1550 .cfi_restore    %r13
1551         movq    24(%rsp),%r12
1552 .cfi_restore    %r12
1553         movq    32(%rsp),%rbp
1554 .cfi_restore    %rbp
1555         movq    40(%rsp),%rbx
1556 .cfi_restore    %rbx
1557         leaq    48(%rsp),%rax
1558         leaq    48(%rsp),%rsp
1559 .cfi_adjust_cfa_offset  -48
1560 .Lbase2_64_avx2_epilogue:
1561         jmp     .Ldo_avx2
1562 .cfi_endproc    
1563
1564 .align  32
1565 .Leven_avx2:
1566 .cfi_startproc  
1567         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
1568         vmovd   0(%rdi),%xmm0
1569         vmovd   4(%rdi),%xmm1
1570         vmovd   8(%rdi),%xmm2
1571         vmovd   12(%rdi),%xmm3
1572         vmovd   16(%rdi),%xmm4
1573
1574 .Ldo_avx2:
1575         leaq    -8(%rsp),%r11
1576 .cfi_def_cfa    %r11,16
1577         subq    $0x128,%rsp
1578         leaq    .Lconst(%rip),%rcx
1579         leaq    48+64(%rdi),%rdi
1580         vmovdqa 96(%rcx),%ymm7
1581
1582
1583         vmovdqu -64(%rdi),%xmm9
1584         andq    $-512,%rsp
1585         vmovdqu -48(%rdi),%xmm10
1586         vmovdqu -32(%rdi),%xmm6
1587         vmovdqu -16(%rdi),%xmm11
1588         vmovdqu 0(%rdi),%xmm12
1589         vmovdqu 16(%rdi),%xmm13
1590         leaq    144(%rsp),%rax
1591         vmovdqu 32(%rdi),%xmm14
1592         vpermd  %ymm9,%ymm7,%ymm9
1593         vmovdqu 48(%rdi),%xmm15
1594         vpermd  %ymm10,%ymm7,%ymm10
1595         vmovdqu 64(%rdi),%xmm5
1596         vpermd  %ymm6,%ymm7,%ymm6
1597         vmovdqa %ymm9,0(%rsp)
1598         vpermd  %ymm11,%ymm7,%ymm11
1599         vmovdqa %ymm10,32-144(%rax)
1600         vpermd  %ymm12,%ymm7,%ymm12
1601         vmovdqa %ymm6,64-144(%rax)
1602         vpermd  %ymm13,%ymm7,%ymm13
1603         vmovdqa %ymm11,96-144(%rax)
1604         vpermd  %ymm14,%ymm7,%ymm14
1605         vmovdqa %ymm12,128-144(%rax)
1606         vpermd  %ymm15,%ymm7,%ymm15
1607         vmovdqa %ymm13,160-144(%rax)
1608         vpermd  %ymm5,%ymm7,%ymm5
1609         vmovdqa %ymm14,192-144(%rax)
1610         vmovdqa %ymm15,224-144(%rax)
1611         vmovdqa %ymm5,256-144(%rax)
1612         vmovdqa 64(%rcx),%ymm5
1613
1614
1615
1616         vmovdqu 0(%rsi),%xmm7
1617         vmovdqu 16(%rsi),%xmm8
1618         vinserti128     $1,32(%rsi),%ymm7,%ymm7
1619         vinserti128     $1,48(%rsi),%ymm8,%ymm8
1620         leaq    64(%rsi),%rsi
1621
1622         vpsrldq $6,%ymm7,%ymm9
1623         vpsrldq $6,%ymm8,%ymm10
1624         vpunpckhqdq     %ymm8,%ymm7,%ymm6
1625         vpunpcklqdq     %ymm10,%ymm9,%ymm9
1626         vpunpcklqdq     %ymm8,%ymm7,%ymm7
1627
1628         vpsrlq  $30,%ymm9,%ymm10
1629         vpsrlq  $4,%ymm9,%ymm9
1630         vpsrlq  $26,%ymm7,%ymm8
1631         vpsrlq  $40,%ymm6,%ymm6
1632         vpand   %ymm5,%ymm9,%ymm9
1633         vpand   %ymm5,%ymm7,%ymm7
1634         vpand   %ymm5,%ymm8,%ymm8
1635         vpand   %ymm5,%ymm10,%ymm10
1636         vpor    32(%rcx),%ymm6,%ymm6
1637
1638         vpaddq  %ymm2,%ymm9,%ymm2
1639         subq    $64,%rdx
1640         jz      .Ltail_avx2
1641         jmp     .Loop_avx2
1642
1643 .align  32
1644 .Loop_avx2:
1645
1646
1647
1648
1649
1650
1651
1652
1653         vpaddq  %ymm0,%ymm7,%ymm0
1654         vmovdqa 0(%rsp),%ymm7
1655         vpaddq  %ymm1,%ymm8,%ymm1
1656         vmovdqa 32(%rsp),%ymm8
1657         vpaddq  %ymm3,%ymm10,%ymm3
1658         vmovdqa 96(%rsp),%ymm9
1659         vpaddq  %ymm4,%ymm6,%ymm4
1660         vmovdqa 48(%rax),%ymm10
1661         vmovdqa 112(%rax),%ymm5
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678         vpmuludq        %ymm2,%ymm7,%ymm13
1679         vpmuludq        %ymm2,%ymm8,%ymm14
1680         vpmuludq        %ymm2,%ymm9,%ymm15
1681         vpmuludq        %ymm2,%ymm10,%ymm11
1682         vpmuludq        %ymm2,%ymm5,%ymm12
1683
1684         vpmuludq        %ymm0,%ymm8,%ymm6
1685         vpmuludq        %ymm1,%ymm8,%ymm2
1686         vpaddq  %ymm6,%ymm12,%ymm12
1687         vpaddq  %ymm2,%ymm13,%ymm13
1688         vpmuludq        %ymm3,%ymm8,%ymm6
1689         vpmuludq        64(%rsp),%ymm4,%ymm2
1690         vpaddq  %ymm6,%ymm15,%ymm15
1691         vpaddq  %ymm2,%ymm11,%ymm11
1692         vmovdqa -16(%rax),%ymm8
1693
1694         vpmuludq        %ymm0,%ymm7,%ymm6
1695         vpmuludq        %ymm1,%ymm7,%ymm2
1696         vpaddq  %ymm6,%ymm11,%ymm11
1697         vpaddq  %ymm2,%ymm12,%ymm12
1698         vpmuludq        %ymm3,%ymm7,%ymm6
1699         vpmuludq        %ymm4,%ymm7,%ymm2
1700         vmovdqu 0(%rsi),%xmm7
1701         vpaddq  %ymm6,%ymm14,%ymm14
1702         vpaddq  %ymm2,%ymm15,%ymm15
1703         vinserti128     $1,32(%rsi),%ymm7,%ymm7
1704
1705         vpmuludq        %ymm3,%ymm8,%ymm6
1706         vpmuludq        %ymm4,%ymm8,%ymm2
1707         vmovdqu 16(%rsi),%xmm8
1708         vpaddq  %ymm6,%ymm11,%ymm11
1709         vpaddq  %ymm2,%ymm12,%ymm12
1710         vmovdqa 16(%rax),%ymm2
1711         vpmuludq        %ymm1,%ymm9,%ymm6
1712         vpmuludq        %ymm0,%ymm9,%ymm9
1713         vpaddq  %ymm6,%ymm14,%ymm14
1714         vpaddq  %ymm9,%ymm13,%ymm13
1715         vinserti128     $1,48(%rsi),%ymm8,%ymm8
1716         leaq    64(%rsi),%rsi
1717
1718         vpmuludq        %ymm1,%ymm2,%ymm6
1719         vpmuludq        %ymm0,%ymm2,%ymm2
1720         vpsrldq $6,%ymm7,%ymm9
1721         vpaddq  %ymm6,%ymm15,%ymm15
1722         vpaddq  %ymm2,%ymm14,%ymm14
1723         vpmuludq        %ymm3,%ymm10,%ymm6
1724         vpmuludq        %ymm4,%ymm10,%ymm2
1725         vpsrldq $6,%ymm8,%ymm10
1726         vpaddq  %ymm6,%ymm12,%ymm12
1727         vpaddq  %ymm2,%ymm13,%ymm13
1728         vpunpckhqdq     %ymm8,%ymm7,%ymm6
1729
1730         vpmuludq        %ymm3,%ymm5,%ymm3
1731         vpmuludq        %ymm4,%ymm5,%ymm4
1732         vpunpcklqdq     %ymm8,%ymm7,%ymm7
1733         vpaddq  %ymm3,%ymm13,%ymm2
1734         vpaddq  %ymm4,%ymm14,%ymm3
1735         vpunpcklqdq     %ymm10,%ymm9,%ymm10
1736         vpmuludq        80(%rax),%ymm0,%ymm4
1737         vpmuludq        %ymm1,%ymm5,%ymm0
1738         vmovdqa 64(%rcx),%ymm5
1739         vpaddq  %ymm4,%ymm15,%ymm4
1740         vpaddq  %ymm0,%ymm11,%ymm0
1741
1742
1743
1744
1745         vpsrlq  $26,%ymm3,%ymm14
1746         vpand   %ymm5,%ymm3,%ymm3
1747         vpaddq  %ymm14,%ymm4,%ymm4
1748
1749         vpsrlq  $26,%ymm0,%ymm11
1750         vpand   %ymm5,%ymm0,%ymm0
1751         vpaddq  %ymm11,%ymm12,%ymm1
1752
1753         vpsrlq  $26,%ymm4,%ymm15
1754         vpand   %ymm5,%ymm4,%ymm4
1755
1756         vpsrlq  $4,%ymm10,%ymm9
1757
1758         vpsrlq  $26,%ymm1,%ymm12
1759         vpand   %ymm5,%ymm1,%ymm1
1760         vpaddq  %ymm12,%ymm2,%ymm2
1761
1762         vpaddq  %ymm15,%ymm0,%ymm0
1763         vpsllq  $2,%ymm15,%ymm15
1764         vpaddq  %ymm15,%ymm0,%ymm0
1765
1766         vpand   %ymm5,%ymm9,%ymm9
1767         vpsrlq  $26,%ymm7,%ymm8
1768
1769         vpsrlq  $26,%ymm2,%ymm13
1770         vpand   %ymm5,%ymm2,%ymm2
1771         vpaddq  %ymm13,%ymm3,%ymm3
1772
1773         vpaddq  %ymm9,%ymm2,%ymm2
1774         vpsrlq  $30,%ymm10,%ymm10
1775
1776         vpsrlq  $26,%ymm0,%ymm11
1777         vpand   %ymm5,%ymm0,%ymm0
1778         vpaddq  %ymm11,%ymm1,%ymm1
1779
1780         vpsrlq  $40,%ymm6,%ymm6
1781
1782         vpsrlq  $26,%ymm3,%ymm14
1783         vpand   %ymm5,%ymm3,%ymm3
1784         vpaddq  %ymm14,%ymm4,%ymm4
1785
1786         vpand   %ymm5,%ymm7,%ymm7
1787         vpand   %ymm5,%ymm8,%ymm8
1788         vpand   %ymm5,%ymm10,%ymm10
1789         vpor    32(%rcx),%ymm6,%ymm6
1790
1791         subq    $64,%rdx
1792         jnz     .Loop_avx2
1793
1794 .byte   0x66,0x90
1795 .Ltail_avx2:
1796
1797
1798
1799
1800
1801
1802
1803         vpaddq  %ymm0,%ymm7,%ymm0
1804         vmovdqu 4(%rsp),%ymm7
1805         vpaddq  %ymm1,%ymm8,%ymm1
1806         vmovdqu 36(%rsp),%ymm8
1807         vpaddq  %ymm3,%ymm10,%ymm3
1808         vmovdqu 100(%rsp),%ymm9
1809         vpaddq  %ymm4,%ymm6,%ymm4
1810         vmovdqu 52(%rax),%ymm10
1811         vmovdqu 116(%rax),%ymm5
1812
1813         vpmuludq        %ymm2,%ymm7,%ymm13
1814         vpmuludq        %ymm2,%ymm8,%ymm14
1815         vpmuludq        %ymm2,%ymm9,%ymm15
1816         vpmuludq        %ymm2,%ymm10,%ymm11
1817         vpmuludq        %ymm2,%ymm5,%ymm12
1818
1819         vpmuludq        %ymm0,%ymm8,%ymm6
1820         vpmuludq        %ymm1,%ymm8,%ymm2
1821         vpaddq  %ymm6,%ymm12,%ymm12
1822         vpaddq  %ymm2,%ymm13,%ymm13
1823         vpmuludq        %ymm3,%ymm8,%ymm6
1824         vpmuludq        68(%rsp),%ymm4,%ymm2
1825         vpaddq  %ymm6,%ymm15,%ymm15
1826         vpaddq  %ymm2,%ymm11,%ymm11
1827
1828         vpmuludq        %ymm0,%ymm7,%ymm6
1829         vpmuludq        %ymm1,%ymm7,%ymm2
1830         vpaddq  %ymm6,%ymm11,%ymm11
1831         vmovdqu -12(%rax),%ymm8
1832         vpaddq  %ymm2,%ymm12,%ymm12
1833         vpmuludq        %ymm3,%ymm7,%ymm6
1834         vpmuludq        %ymm4,%ymm7,%ymm2
1835         vpaddq  %ymm6,%ymm14,%ymm14
1836         vpaddq  %ymm2,%ymm15,%ymm15
1837
1838         vpmuludq        %ymm3,%ymm8,%ymm6
1839         vpmuludq        %ymm4,%ymm8,%ymm2
1840         vpaddq  %ymm6,%ymm11,%ymm11
1841         vpaddq  %ymm2,%ymm12,%ymm12
1842         vmovdqu 20(%rax),%ymm2
1843         vpmuludq        %ymm1,%ymm9,%ymm6
1844         vpmuludq        %ymm0,%ymm9,%ymm9
1845         vpaddq  %ymm6,%ymm14,%ymm14
1846         vpaddq  %ymm9,%ymm13,%ymm13
1847
1848         vpmuludq        %ymm1,%ymm2,%ymm6
1849         vpmuludq        %ymm0,%ymm2,%ymm2
1850         vpaddq  %ymm6,%ymm15,%ymm15
1851         vpaddq  %ymm2,%ymm14,%ymm14
1852         vpmuludq        %ymm3,%ymm10,%ymm6
1853         vpmuludq        %ymm4,%ymm10,%ymm2
1854         vpaddq  %ymm6,%ymm12,%ymm12
1855         vpaddq  %ymm2,%ymm13,%ymm13
1856
1857         vpmuludq        %ymm3,%ymm5,%ymm3
1858         vpmuludq        %ymm4,%ymm5,%ymm4
1859         vpaddq  %ymm3,%ymm13,%ymm2
1860         vpaddq  %ymm4,%ymm14,%ymm3
1861         vpmuludq        84(%rax),%ymm0,%ymm4
1862         vpmuludq        %ymm1,%ymm5,%ymm0
1863         vmovdqa 64(%rcx),%ymm5
1864         vpaddq  %ymm4,%ymm15,%ymm4
1865         vpaddq  %ymm0,%ymm11,%ymm0
1866
1867
1868
1869
1870         vpsrldq $8,%ymm12,%ymm8
1871         vpsrldq $8,%ymm2,%ymm9
1872         vpsrldq $8,%ymm3,%ymm10
1873         vpsrldq $8,%ymm4,%ymm6
1874         vpsrldq $8,%ymm0,%ymm7
1875         vpaddq  %ymm8,%ymm12,%ymm12
1876         vpaddq  %ymm9,%ymm2,%ymm2
1877         vpaddq  %ymm10,%ymm3,%ymm3
1878         vpaddq  %ymm6,%ymm4,%ymm4
1879         vpaddq  %ymm7,%ymm0,%ymm0
1880
1881         vpermq  $0x2,%ymm3,%ymm10
1882         vpermq  $0x2,%ymm4,%ymm6
1883         vpermq  $0x2,%ymm0,%ymm7
1884         vpermq  $0x2,%ymm12,%ymm8
1885         vpermq  $0x2,%ymm2,%ymm9
1886         vpaddq  %ymm10,%ymm3,%ymm3
1887         vpaddq  %ymm6,%ymm4,%ymm4
1888         vpaddq  %ymm7,%ymm0,%ymm0
1889         vpaddq  %ymm8,%ymm12,%ymm12
1890         vpaddq  %ymm9,%ymm2,%ymm2
1891
1892
1893
1894
1895         vpsrlq  $26,%ymm3,%ymm14
1896         vpand   %ymm5,%ymm3,%ymm3
1897         vpaddq  %ymm14,%ymm4,%ymm4
1898
1899         vpsrlq  $26,%ymm0,%ymm11
1900         vpand   %ymm5,%ymm0,%ymm0
1901         vpaddq  %ymm11,%ymm12,%ymm1
1902
1903         vpsrlq  $26,%ymm4,%ymm15
1904         vpand   %ymm5,%ymm4,%ymm4
1905
1906         vpsrlq  $26,%ymm1,%ymm12
1907         vpand   %ymm5,%ymm1,%ymm1
1908         vpaddq  %ymm12,%ymm2,%ymm2
1909
1910         vpaddq  %ymm15,%ymm0,%ymm0
1911         vpsllq  $2,%ymm15,%ymm15
1912         vpaddq  %ymm15,%ymm0,%ymm0
1913
1914         vpsrlq  $26,%ymm2,%ymm13
1915         vpand   %ymm5,%ymm2,%ymm2
1916         vpaddq  %ymm13,%ymm3,%ymm3
1917
1918         vpsrlq  $26,%ymm0,%ymm11
1919         vpand   %ymm5,%ymm0,%ymm0
1920         vpaddq  %ymm11,%ymm1,%ymm1
1921
1922         vpsrlq  $26,%ymm3,%ymm14
1923         vpand   %ymm5,%ymm3,%ymm3
1924         vpaddq  %ymm14,%ymm4,%ymm4
1925
1926         vmovd   %xmm0,-112(%rdi)
1927         vmovd   %xmm1,-108(%rdi)
1928         vmovd   %xmm2,-104(%rdi)
1929         vmovd   %xmm3,-100(%rdi)
1930         vmovd   %xmm4,-96(%rdi)
1931         leaq    8(%r11),%rsp
1932 .cfi_def_cfa    %rsp,8
1933         vzeroupper
1934         .byte   0xf3,0xc3
1935 .cfi_endproc    
1936 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
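/*
 * Constant pool for the vector code: .Lmask24/.L129 supply the 2^24
 * "padbit" lane value for the top base 2^26 limb, .Lmask26 the 26-bit limb
 * mask, and the permd tables the lane shuffles used by AVX2.  The base 2^44
 * constants that follow appear to be retained from the upstream generator;
 * no AVX-512 code is emitted in this file.
 */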
1937 .align  64
1938 .Lconst:
1939 .Lmask24:
1940 .long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1941 .L129:
1942 .long   16777216,0,16777216,0,16777216,0,16777216,0
1943 .Lmask26:
1944 .long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1945 .Lpermd_avx2:
1946 .long   2,2,2,3,2,0,2,1
1947 .Lpermd_avx512:
1948 .long   0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1949
1950 .L2_44_inp_permd:
1951 .long   0,1,1,2,2,3,7,7
1952 .L2_44_inp_shift:
1953 .quad   0,12,24,64
1954 .L2_44_mask:
1955 .quad   0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1956 .L2_44_shift_rgt:
1957 .quad   44,44,42,64
1958 .L2_44_shift_lft:
1959 .quad   8,8,10,64
1960
1961 .align  64
1962 .Lx_mask44:
1963 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1964 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1965 .Lx_mask42:
1966 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1967 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1968 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1969 .align  16
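/*
 * xor128_encrypt_n_pad(out, inp, otp, len) -- %rdi=out, %rsi=inp, %rdx=otp,
 * %rcx=len (argument names inferred from register usage).  XORs the input
 * with the key-stream buffer at otp, writes the result to out, copies the
 * ciphertext back into the otp buffer and zero-pads a partial final block
 * to a 16-byte boundary; returns the advanced otp pointer.
 */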
1970 .globl  xor128_encrypt_n_pad
1971 .type   xor128_encrypt_n_pad,@function
1972 .align  16
1973 xor128_encrypt_n_pad:
1974 .cfi_startproc  
1975         subq    %rdx,%rsi
1976         subq    %rdx,%rdi
1977         movq    %rcx,%r10
1978         shrq    $4,%rcx
1979         jz      .Ltail_enc
1980         nop
1981 .Loop_enc_xmm:
1982         movdqu  (%rsi,%rdx,1),%xmm0
1983         pxor    (%rdx),%xmm0
1984         movdqu  %xmm0,(%rdi,%rdx,1)
1985         movdqa  %xmm0,(%rdx)
1986         leaq    16(%rdx),%rdx
1987         decq    %rcx
1988         jnz     .Loop_enc_xmm
1989
1990         andq    $15,%r10
1991         jz      .Ldone_enc
1992
1993 .Ltail_enc:
1994         movq    $16,%rcx
1995         subq    %r10,%rcx
1996         xorl    %eax,%eax
1997 .Loop_enc_byte:
1998         movb    (%rsi,%rdx,1),%al
1999         xorb    (%rdx),%al
2000         movb    %al,(%rdi,%rdx,1)
2001         movb    %al,(%rdx)
2002         leaq    1(%rdx),%rdx
2003         decq    %r10
2004         jnz     .Loop_enc_byte
2005
2006         xorl    %eax,%eax
2007 .Loop_enc_pad:
2008         movb    %al,(%rdx)
2009         leaq    1(%rdx),%rdx
2010         decq    %rcx
2011         jnz     .Loop_enc_pad
2012
2013 .Ldone_enc:
2014         movq    %rdx,%rax
2015         .byte   0xf3,0xc3
2016 .cfi_endproc    
2017 .size   xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2018
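/*
 * xor128_decrypt_n_pad(out, inp, otp, len): as above, but the input is
 * ciphertext; the plaintext goes to out while the ciphertext (plus zero
 * padding of a partial final block) is left in the otp buffer, and the
 * advanced otp pointer is returned.
 */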
2019 .globl  xor128_decrypt_n_pad
2020 .type   xor128_decrypt_n_pad,@function
2021 .align  16
2022 xor128_decrypt_n_pad:
2023 .cfi_startproc  
2024         subq    %rdx,%rsi
2025         subq    %rdx,%rdi
2026         movq    %rcx,%r10
2027         shrq    $4,%rcx
2028         jz      .Ltail_dec
2029         nop
2030 .Loop_dec_xmm:
2031         movdqu  (%rsi,%rdx,1),%xmm0
2032         movdqa  (%rdx),%xmm1
2033         pxor    %xmm0,%xmm1
2034         movdqu  %xmm1,(%rdi,%rdx,1)
2035         movdqa  %xmm0,(%rdx)
2036         leaq    16(%rdx),%rdx
2037         decq    %rcx
2038         jnz     .Loop_dec_xmm
2039
2040         pxor    %xmm1,%xmm1
2041         andq    $15,%r10
2042         jz      .Ldone_dec
2043
2044 .Ltail_dec:
2045         movq    $16,%rcx
2046         subq    %r10,%rcx
2047         xorl    %eax,%eax
2048         xorq    %r11,%r11
2049 .Loop_dec_byte:
2050         movb    (%rsi,%rdx,1),%r11b
2051         movb    (%rdx),%al
2052         xorb    %r11b,%al
2053         movb    %al,(%rdi,%rdx,1)
2054         movb    %r11b,(%rdx)
2055         leaq    1(%rdx),%rdx
2056         decq    %r10
2057         jnz     .Loop_dec_byte
2058
2059         xorl    %eax,%eax
2060 .Loop_dec_pad:
2061         movb    %al,(%rdx)
2062         leaq    1(%rdx),%rdx
2063         decq    %rcx
2064         jnz     .Loop_dec_pad
2065
2066 .Ldone_dec:
2067         movq    %rdx,%rax
2068         .byte   0xf3,0xc3
2069 .cfi_endproc    
2070 .size   xor128_decrypt_n_pad,.-xor128_decrypt_n_pad