1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
3 .text   
4
5
6
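/*
 * Editorial note: this is the Poly1305 MAC, scalar base 2^64 code plus
 * AVX/AVX2 paths, as emitted by OpenSSL's poly1305-x86_64.pl.  The C
 * prototypes quoted in the comments below are inferred from the register
 * usage (System V AMD64 ABI) and from the generating script, not stated
 * by this file itself.
 */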
7 .globl  poly1305_init
8 .hidden poly1305_init
9 .globl  poly1305_blocks
10 .hidden poly1305_blocks
11 .globl  poly1305_emit
12 .hidden poly1305_emit
13
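/*
 * Presumed prototype: int poly1305_init(void *ctx, const unsigned char *key,
 * void *func[2]).  %rdi = ctx (the accumulator h at offsets 0..23 is
 * zeroed), %rsi = key (NULL just clears the state and returns 0; otherwise
 * only its first 16 bytes, r, are read), %rdx = table that receives the
 * blocks/emit implementations selected from OPENSSL_ia32cap_P (scalar,
 * AVX or AVX2).  r is clamped and stored at ctx+24/ctx+32; returns 1.
 */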
14 .type   poly1305_init,@function
15 .align  32
16 poly1305_init:
17         xorq    %rax,%rax
18         movq    %rax,0(%rdi)
19         movq    %rax,8(%rdi)
20         movq    %rax,16(%rdi)
21
22         cmpq    $0,%rsi
23         je      .Lno_key
24
25         leaq    poly1305_blocks(%rip),%r10
26         leaq    poly1305_emit(%rip),%r11
27         movq    OPENSSL_ia32cap_P+4(%rip),%r9
28         leaq    poly1305_blocks_avx(%rip),%rax
29         leaq    poly1305_emit_avx(%rip),%rcx
30         btq     $28,%r9
31         cmovcq  %rax,%r10
32         cmovcq  %rcx,%r11
33         leaq    poly1305_blocks_avx2(%rip),%rax
34         btq     $37,%r9
35         cmovcq  %rax,%r10
36         movq    $0x0ffffffc0fffffff,%rax
37         movq    $0x0ffffffc0ffffffc,%rcx
38         andq    0(%rsi),%rax
39         andq    8(%rsi),%rcx
40         movq    %rax,24(%rdi)
41         movq    %rcx,32(%rdi)
42         movq    %r10,0(%rdx)
43         movq    %r11,8(%rdx)
44         movl    $1,%eax
45 .Lno_key:
46         .byte   0xf3,0xc3
47 .size   poly1305_init,.-poly1305_init
48
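/*
 * Presumed prototype: void poly1305_blocks(void *ctx, const unsigned char
 * *inp, size_t len, u32 padbit).  %rdi = ctx, %rsi = input, %rdx = length
 * (only whole 16-byte blocks are processed), %rcx = bit added above each
 * block (1 for full blocks, 0 for an already-padded final block).  The
 * loop below computes h = (h + m) * r mod 2^130-5 per block, with h in
 * %r14/%rbx/%rbp and the clamped r loaded from ctx+24/ctx+32.
 */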
49 .type   poly1305_blocks,@function
50 .align  32
51 poly1305_blocks:
52 .cfi_startproc  
53 .Lblocks:
54         shrq    $4,%rdx
55         jz      .Lno_data
56
57         pushq   %rbx
58 .cfi_adjust_cfa_offset  8
59 .cfi_offset     %rbx,-16
60         pushq   %rbp
61 .cfi_adjust_cfa_offset  8
62 .cfi_offset     %rbp,-24
63         pushq   %r12
64 .cfi_adjust_cfa_offset  8
65 .cfi_offset     %r12,-32
66         pushq   %r13
67 .cfi_adjust_cfa_offset  8
68 .cfi_offset     %r13,-40
69         pushq   %r14
70 .cfi_adjust_cfa_offset  8
71 .cfi_offset     %r14,-48
72         pushq   %r15
73 .cfi_adjust_cfa_offset  8
74 .cfi_offset     %r15,-56
75 .Lblocks_body:
76
77         movq    %rdx,%r15
78
79         movq    24(%rdi),%r11
80         movq    32(%rdi),%r13
81
82         movq    0(%rdi),%r14
83         movq    8(%rdi),%rbx
84         movq    16(%rdi),%rbp
85
86         movq    %r13,%r12
87         shrq    $2,%r13
88         movq    %r12,%rax
89         addq    %r12,%r13
90         jmp     .Loop
91
92 .align  32
93 .Loop:
94         addq    0(%rsi),%r14
95         adcq    8(%rsi),%rbx
96         leaq    16(%rsi),%rsi
97         adcq    %rcx,%rbp
98         mulq    %r14
99         movq    %rax,%r9
100         movq    %r11,%rax
101         movq    %rdx,%r10
102
103         mulq    %r14
104         movq    %rax,%r14
105         movq    %r11,%rax
106         movq    %rdx,%r8
107
108         mulq    %rbx
109         addq    %rax,%r9
110         movq    %r13,%rax
111         adcq    %rdx,%r10
112
113         mulq    %rbx
114         movq    %rbp,%rbx
115         addq    %rax,%r14
116         adcq    %rdx,%r8
117
118         imulq   %r13,%rbx
119         addq    %rbx,%r9
120         movq    %r8,%rbx
121         adcq    $0,%r10
122
123         imulq   %r11,%rbp
124         addq    %r9,%rbx
125         movq    $-4,%rax
126         adcq    %rbp,%r10
127
128         andq    %r10,%rax
129         movq    %r10,%rbp
130         shrq    $2,%r10
131         andq    $3,%rbp
132         addq    %r10,%rax
133         addq    %rax,%r14
134         adcq    $0,%rbx
135         adcq    $0,%rbp
136         movq    %r12,%rax
137         decq    %r15
138         jnz     .Loop
139
140         movq    %r14,0(%rdi)
141         movq    %rbx,8(%rdi)
142         movq    %rbp,16(%rdi)
143
144         movq    0(%rsp),%r15
145 .cfi_restore    %r15
146         movq    8(%rsp),%r14
147 .cfi_restore    %r14
148         movq    16(%rsp),%r13
149 .cfi_restore    %r13
150         movq    24(%rsp),%r12
151 .cfi_restore    %r12
152         movq    32(%rsp),%rbp
153 .cfi_restore    %rbp
154         movq    40(%rsp),%rbx
155 .cfi_restore    %rbx
156         leaq    48(%rsp),%rsp
157 .cfi_adjust_cfa_offset  -48
158 .Lno_data:
159 .Lblocks_epilogue:
160         .byte   0xf3,0xc3
161 .cfi_endproc    
162 .size   poly1305_blocks,.-poly1305_blocks
163
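/*
 * Presumed prototype: void poly1305_emit(void *ctx, unsigned char mac[16],
 * const u32 nonce[4]).  %rdi = ctx, %rsi = 16-byte tag output, %rdx =
 * nonce (the s half of the key).  Finishes the reduction mod 2^130-5 by
 * conditionally selecting h+5, then adds the nonce with carry and stores
 * the low 128 bits as the tag.
 */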
164 .type   poly1305_emit,@function
165 .align  32
166 poly1305_emit:
167 .Lemit:
168         movq    0(%rdi),%r8
169         movq    8(%rdi),%r9
170         movq    16(%rdi),%r10
171
172         movq    %r8,%rax
173         addq    $5,%r8
174         movq    %r9,%rcx
175         adcq    $0,%r9
176         adcq    $0,%r10
177         shrq    $2,%r10
178         cmovnzq %r8,%rax
179         cmovnzq %r9,%rcx
180
181         addq    0(%rdx),%rax
182         adcq    8(%rdx),%rcx
183         movq    %rax,0(%rsi)
184         movq    %rcx,8(%rsi)
185
186         .byte   0xf3,0xc3
187 .size   poly1305_emit,.-poly1305_emit
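/*
 * __poly1305_block: one h = h * r mod 2^130-5 step shared by the scalar
 * callers (message injection is done by the caller).  On entry h is in
 * %r14/%rbx/%rbp, %r11 = r0, %rax = r1 (with a copy in %r12) and
 * %r13 = r1 + (r1 >> 2); %r8-%r10 and %rdx are clobbered.
 */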
188 .type   __poly1305_block,@function
189 .align  32
190 __poly1305_block:
191         mulq    %r14
192         movq    %rax,%r9
193         movq    %r11,%rax
194         movq    %rdx,%r10
195
196         mulq    %r14
197         movq    %rax,%r14
198         movq    %r11,%rax
199         movq    %rdx,%r8
200
201         mulq    %rbx
202         addq    %rax,%r9
203         movq    %r13,%rax
204         adcq    %rdx,%r10
205
206         mulq    %rbx
207         movq    %rbp,%rbx
208         addq    %rax,%r14
209         adcq    %rdx,%r8
210
211         imulq   %r13,%rbx
212         addq    %rbx,%r9
213         movq    %r8,%rbx
214         adcq    $0,%r10
215
216         imulq   %r11,%rbp
217         addq    %r9,%rbx
218         movq    $-4,%rax
219         adcq    %rbp,%r10
220
221         andq    %r10,%rax
222         movq    %r10,%rbp
223         shrq    $2,%r10
224         andq    $3,%rbp
225         addq    %r10,%rax
226         addq    %rax,%r14
227         adcq    $0,%rbx
228         adcq    $0,%rbp
229         .byte   0xf3,0xc3
230 .size   __poly1305_block,.-__poly1305_block
231
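/*
 * __poly1305_init_avx: uses __poly1305_block to obtain r^2, r^3 and r^4,
 * then stores the powers of r (and their pre-multiplied 5*x copies) as
 * 26-bit limbs in the table starting at ctx+48, in the interleaved layout
 * consumed by the AVX/AVX2 block routines.
 */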
232 .type   __poly1305_init_avx,@function
233 .align  32
234 __poly1305_init_avx:
235         movq    %r11,%r14
236         movq    %r12,%rbx
237         xorq    %rbp,%rbp
238
239         leaq    48+64(%rdi),%rdi
240
241         movq    %r12,%rax
242         call    __poly1305_block
243
244         movl    $0x3ffffff,%eax
245         movl    $0x3ffffff,%edx
246         movq    %r14,%r8
247         andl    %r14d,%eax
248         movq    %r11,%r9
249         andl    %r11d,%edx
250         movl    %eax,-64(%rdi)
251         shrq    $26,%r8
252         movl    %edx,-60(%rdi)
253         shrq    $26,%r9
254
255         movl    $0x3ffffff,%eax
256         movl    $0x3ffffff,%edx
257         andl    %r8d,%eax
258         andl    %r9d,%edx
259         movl    %eax,-48(%rdi)
260         leal    (%rax,%rax,4),%eax
261         movl    %edx,-44(%rdi)
262         leal    (%rdx,%rdx,4),%edx
263         movl    %eax,-32(%rdi)
264         shrq    $26,%r8
265         movl    %edx,-28(%rdi)
266         shrq    $26,%r9
267
268         movq    %rbx,%rax
269         movq    %r12,%rdx
270         shlq    $12,%rax
271         shlq    $12,%rdx
272         orq     %r8,%rax
273         orq     %r9,%rdx
274         andl    $0x3ffffff,%eax
275         andl    $0x3ffffff,%edx
276         movl    %eax,-16(%rdi)
277         leal    (%rax,%rax,4),%eax
278         movl    %edx,-12(%rdi)
279         leal    (%rdx,%rdx,4),%edx
280         movl    %eax,0(%rdi)
281         movq    %rbx,%r8
282         movl    %edx,4(%rdi)
283         movq    %r12,%r9
284
285         movl    $0x3ffffff,%eax
286         movl    $0x3ffffff,%edx
287         shrq    $14,%r8
288         shrq    $14,%r9
289         andl    %r8d,%eax
290         andl    %r9d,%edx
291         movl    %eax,16(%rdi)
292         leal    (%rax,%rax,4),%eax
293         movl    %edx,20(%rdi)
294         leal    (%rdx,%rdx,4),%edx
295         movl    %eax,32(%rdi)
296         shrq    $26,%r8
297         movl    %edx,36(%rdi)
298         shrq    $26,%r9
299
300         movq    %rbp,%rax
301         shlq    $24,%rax
302         orq     %rax,%r8
303         movl    %r8d,48(%rdi)
304         leaq    (%r8,%r8,4),%r8
305         movl    %r9d,52(%rdi)
306         leaq    (%r9,%r9,4),%r9
307         movl    %r8d,64(%rdi)
308         movl    %r9d,68(%rdi)
309
310         movq    %r12,%rax
311         call    __poly1305_block
312
313         movl    $0x3ffffff,%eax
314         movq    %r14,%r8
315         andl    %r14d,%eax
316         shrq    $26,%r8
317         movl    %eax,-52(%rdi)
318
319         movl    $0x3ffffff,%edx
320         andl    %r8d,%edx
321         movl    %edx,-36(%rdi)
322         leal    (%rdx,%rdx,4),%edx
323         shrq    $26,%r8
324         movl    %edx,-20(%rdi)
325
326         movq    %rbx,%rax
327         shlq    $12,%rax
328         orq     %r8,%rax
329         andl    $0x3ffffff,%eax
330         movl    %eax,-4(%rdi)
331         leal    (%rax,%rax,4),%eax
332         movq    %rbx,%r8
333         movl    %eax,12(%rdi)
334
335         movl    $0x3ffffff,%edx
336         shrq    $14,%r8
337         andl    %r8d,%edx
338         movl    %edx,28(%rdi)
339         leal    (%rdx,%rdx,4),%edx
340         shrq    $26,%r8
341         movl    %edx,44(%rdi)
342
343         movq    %rbp,%rax
344         shlq    $24,%rax
345         orq     %rax,%r8
346         movl    %r8d,60(%rdi)
347         leaq    (%r8,%r8,4),%r8
348         movl    %r8d,76(%rdi)
349
350         movq    %r12,%rax
351         call    __poly1305_block
352
353         movl    $0x3ffffff,%eax
354         movq    %r14,%r8
355         andl    %r14d,%eax
356         shrq    $26,%r8
357         movl    %eax,-56(%rdi)
358
359         movl    $0x3ffffff,%edx
360         andl    %r8d,%edx
361         movl    %edx,-40(%rdi)
362         leal    (%rdx,%rdx,4),%edx
363         shrq    $26,%r8
364         movl    %edx,-24(%rdi)
365
366         movq    %rbx,%rax
367         shlq    $12,%rax
368         orq     %r8,%rax
369         andl    $0x3ffffff,%eax
370         movl    %eax,-8(%rdi)
371         leal    (%rax,%rax,4),%eax
372         movq    %rbx,%r8
373         movl    %eax,8(%rdi)
374
375         movl    $0x3ffffff,%edx
376         shrq    $14,%r8
377         andl    %r8d,%edx
378         movl    %edx,24(%rdi)
379         leal    (%rdx,%rdx,4),%edx
380         shrq    $26,%r8
381         movl    %edx,40(%rdi)
382
383         movq    %rbp,%rax
384         shlq    $24,%rax
385         orq     %rax,%r8
386         movl    %r8d,56(%rdi)
387         leaq    (%r8,%r8,4),%r8
388         movl    %r8d,72(%rdi)
389
390         leaq    -48-64(%rdi),%rdi
391         .byte   0xf3,0xc3
392 .size   __poly1305_init_avx,.-__poly1305_init_avx
393
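/*
 * poly1305_blocks_avx: same interface as poly1305_blocks.  For inputs of
 * 128 bytes or more it converts h to five 26-bit limbs (flag at ctx+20),
 * builds the power-of-r table on first use, and the main .Loop_avx then
 * consumes 64 bytes per iteration with 128-bit vector multiplies against
 * those powers; short or odd-length runs go through the scalar
 * __poly1305_block path.
 */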
394 .type   poly1305_blocks_avx,@function
395 .align  32
396 poly1305_blocks_avx:
397 .cfi_startproc  
398         movl    20(%rdi),%r8d
399         cmpq    $128,%rdx
400         jae     .Lblocks_avx
401         testl   %r8d,%r8d
402         jz      .Lblocks
403
404 .Lblocks_avx:
405         andq    $-16,%rdx
406         jz      .Lno_data_avx
407
408         vzeroupper
409
410         testl   %r8d,%r8d
411         jz      .Lbase2_64_avx
412
413         testq   $31,%rdx
414         jz      .Leven_avx
415
416         pushq   %rbx
417 .cfi_adjust_cfa_offset  8
418 .cfi_offset     %rbx,-16
419         pushq   %rbp
420 .cfi_adjust_cfa_offset  8
421 .cfi_offset     %rbp,-24
422         pushq   %r12
423 .cfi_adjust_cfa_offset  8
424 .cfi_offset     %r12,-32
425         pushq   %r13
426 .cfi_adjust_cfa_offset  8
427 .cfi_offset     %r13,-40
428         pushq   %r14
429 .cfi_adjust_cfa_offset  8
430 .cfi_offset     %r14,-48
431         pushq   %r15
432 .cfi_adjust_cfa_offset  8
433 .cfi_offset     %r15,-56
434 .Lblocks_avx_body:
435
436         movq    %rdx,%r15
437
438         movq    0(%rdi),%r8
439         movq    8(%rdi),%r9
440         movl    16(%rdi),%ebp
441
442         movq    24(%rdi),%r11
443         movq    32(%rdi),%r13
444
445
446         movl    %r8d,%r14d
447         andq    $-2147483648,%r8
448         movq    %r9,%r12
449         movl    %r9d,%ebx
450         andq    $-2147483648,%r9
451
452         shrq    $6,%r8
453         shlq    $52,%r12
454         addq    %r8,%r14
455         shrq    $12,%rbx
456         shrq    $18,%r9
457         addq    %r12,%r14
458         adcq    %r9,%rbx
459
460         movq    %rbp,%r8
461         shlq    $40,%r8
462         shrq    $24,%rbp
463         addq    %r8,%rbx
464         adcq    $0,%rbp
465
466         movq    $-4,%r9
467         movq    %rbp,%r8
468         andq    %rbp,%r9
469         shrq    $2,%r8
470         andq    $3,%rbp
471         addq    %r9,%r8
472         addq    %r8,%r14
473         adcq    $0,%rbx
474         adcq    $0,%rbp
475
476         movq    %r13,%r12
477         movq    %r13,%rax
478         shrq    $2,%r13
479         addq    %r12,%r13
480
481         addq    0(%rsi),%r14
482         adcq    8(%rsi),%rbx
483         leaq    16(%rsi),%rsi
484         adcq    %rcx,%rbp
485
486         call    __poly1305_block
487
488         testq   %rcx,%rcx
489         jz      .Lstore_base2_64_avx
490
491
492         movq    %r14,%rax
493         movq    %r14,%rdx
494         shrq    $52,%r14
495         movq    %rbx,%r11
496         movq    %rbx,%r12
497         shrq    $26,%rdx
498         andq    $0x3ffffff,%rax
499         shlq    $12,%r11
500         andq    $0x3ffffff,%rdx
501         shrq    $14,%rbx
502         orq     %r11,%r14
503         shlq    $24,%rbp
504         andq    $0x3ffffff,%r14
505         shrq    $40,%r12
506         andq    $0x3ffffff,%rbx
507         orq     %r12,%rbp
508
509         subq    $16,%r15
510         jz      .Lstore_base2_26_avx
511
512         vmovd   %eax,%xmm0
513         vmovd   %edx,%xmm1
514         vmovd   %r14d,%xmm2
515         vmovd   %ebx,%xmm3
516         vmovd   %ebp,%xmm4
517         jmp     .Lproceed_avx
518
519 .align  32
520 .Lstore_base2_64_avx:
521         movq    %r14,0(%rdi)
522         movq    %rbx,8(%rdi)
523         movq    %rbp,16(%rdi)
524         jmp     .Ldone_avx
525
526 .align  16
527 .Lstore_base2_26_avx:
528         movl    %eax,0(%rdi)
529         movl    %edx,4(%rdi)
530         movl    %r14d,8(%rdi)
531         movl    %ebx,12(%rdi)
532         movl    %ebp,16(%rdi)
533 .align  16
534 .Ldone_avx:
535         movq    0(%rsp),%r15
536 .cfi_restore    %r15
537         movq    8(%rsp),%r14
538 .cfi_restore    %r14
539         movq    16(%rsp),%r13
540 .cfi_restore    %r13
541         movq    24(%rsp),%r12
542 .cfi_restore    %r12
543         movq    32(%rsp),%rbp
544 .cfi_restore    %rbp
545         movq    40(%rsp),%rbx
546 .cfi_restore    %rbx
547         leaq    48(%rsp),%rsp
548 .cfi_adjust_cfa_offset  -48
549 .Lno_data_avx:
550 .Lblocks_avx_epilogue:
551         .byte   0xf3,0xc3
552 .cfi_endproc    
553
554 .align  32
555 .Lbase2_64_avx:
556 .cfi_startproc  
557         pushq   %rbx
558 .cfi_adjust_cfa_offset  8
559 .cfi_offset     %rbx,-16
560         pushq   %rbp
561 .cfi_adjust_cfa_offset  8
562 .cfi_offset     %rbp,-24
563         pushq   %r12
564 .cfi_adjust_cfa_offset  8
565 .cfi_offset     %r12,-32
566         pushq   %r13
567 .cfi_adjust_cfa_offset  8
568 .cfi_offset     %r13,-40
569         pushq   %r14
570 .cfi_adjust_cfa_offset  8
571 .cfi_offset     %r14,-48
572         pushq   %r15
573 .cfi_adjust_cfa_offset  8
574 .cfi_offset     %r15,-56
575 .Lbase2_64_avx_body:
576
577         movq    %rdx,%r15
578
579         movq    24(%rdi),%r11
580         movq    32(%rdi),%r13
581
582         movq    0(%rdi),%r14
583         movq    8(%rdi),%rbx
584         movl    16(%rdi),%ebp
585
586         movq    %r13,%r12
587         movq    %r13,%rax
588         shrq    $2,%r13
589         addq    %r12,%r13
590
591         testq   $31,%rdx
592         jz      .Linit_avx
593
594         addq    0(%rsi),%r14
595         adcq    8(%rsi),%rbx
596         leaq    16(%rsi),%rsi
597         adcq    %rcx,%rbp
598         subq    $16,%r15
599
600         call    __poly1305_block
601
602 .Linit_avx:
603
604         movq    %r14,%rax
605         movq    %r14,%rdx
606         shrq    $52,%r14
607         movq    %rbx,%r8
608         movq    %rbx,%r9
609         shrq    $26,%rdx
610         andq    $0x3ffffff,%rax
611         shlq    $12,%r8
612         andq    $0x3ffffff,%rdx
613         shrq    $14,%rbx
614         orq     %r8,%r14
615         shlq    $24,%rbp
616         andq    $0x3ffffff,%r14
617         shrq    $40,%r9
618         andq    $0x3ffffff,%rbx
619         orq     %r9,%rbp
620
621         vmovd   %eax,%xmm0
622         vmovd   %edx,%xmm1
623         vmovd   %r14d,%xmm2
624         vmovd   %ebx,%xmm3
625         vmovd   %ebp,%xmm4
626         movl    $1,20(%rdi)
627
628         call    __poly1305_init_avx
629
630 .Lproceed_avx:
631         movq    %r15,%rdx
632
633         movq    0(%rsp),%r15
634 .cfi_restore    %r15
635         movq    8(%rsp),%r14
636 .cfi_restore    %r14
637         movq    16(%rsp),%r13
638 .cfi_restore    %r13
639         movq    24(%rsp),%r12
640 .cfi_restore    %r12
641         movq    32(%rsp),%rbp
642 .cfi_restore    %rbp
643         movq    40(%rsp),%rbx
644 .cfi_restore    %rbx
645         leaq    48(%rsp),%rax
646         leaq    48(%rsp),%rsp
647 .cfi_adjust_cfa_offset  -48
648 .Lbase2_64_avx_epilogue:
649         jmp     .Ldo_avx
650 .cfi_endproc    
651
652 .align  32
653 .Leven_avx:
654 .cfi_startproc  
655         vmovd   0(%rdi),%xmm0
656         vmovd   4(%rdi),%xmm1
657         vmovd   8(%rdi),%xmm2
658         vmovd   12(%rdi),%xmm3
659         vmovd   16(%rdi),%xmm4
660
661 .Ldo_avx:
662         leaq    -88(%rsp),%r11
663 .cfi_def_cfa    %r11,0x60
664         subq    $0x178,%rsp
665         subq    $64,%rdx
666         leaq    -32(%rsi),%rax
667         cmovcq  %rax,%rsi
668
669         vmovdqu 48(%rdi),%xmm14
670         leaq    112(%rdi),%rdi
671         leaq    .Lconst(%rip),%rcx
672
673
674
675         vmovdqu 32(%rsi),%xmm5
676         vmovdqu 48(%rsi),%xmm6
677         vmovdqa 64(%rcx),%xmm15
678
679         vpsrldq $6,%xmm5,%xmm7
680         vpsrldq $6,%xmm6,%xmm8
681         vpunpckhqdq     %xmm6,%xmm5,%xmm9
682         vpunpcklqdq     %xmm6,%xmm5,%xmm5
683         vpunpcklqdq     %xmm8,%xmm7,%xmm8
684
685         vpsrlq  $40,%xmm9,%xmm9
686         vpsrlq  $26,%xmm5,%xmm6
687         vpand   %xmm15,%xmm5,%xmm5
688         vpsrlq  $4,%xmm8,%xmm7
689         vpand   %xmm15,%xmm6,%xmm6
690         vpsrlq  $30,%xmm8,%xmm8
691         vpand   %xmm15,%xmm7,%xmm7
692         vpand   %xmm15,%xmm8,%xmm8
693         vpor    32(%rcx),%xmm9,%xmm9
694
695         jbe     .Lskip_loop_avx
696
697
698         vmovdqu -48(%rdi),%xmm11
699         vmovdqu -32(%rdi),%xmm12
700         vpshufd $0xEE,%xmm14,%xmm13
701         vpshufd $0x44,%xmm14,%xmm10
702         vmovdqa %xmm13,-144(%r11)
703         vmovdqa %xmm10,0(%rsp)
704         vpshufd $0xEE,%xmm11,%xmm14
705         vmovdqu -16(%rdi),%xmm10
706         vpshufd $0x44,%xmm11,%xmm11
707         vmovdqa %xmm14,-128(%r11)
708         vmovdqa %xmm11,16(%rsp)
709         vpshufd $0xEE,%xmm12,%xmm13
710         vmovdqu 0(%rdi),%xmm11
711         vpshufd $0x44,%xmm12,%xmm12
712         vmovdqa %xmm13,-112(%r11)
713         vmovdqa %xmm12,32(%rsp)
714         vpshufd $0xEE,%xmm10,%xmm14
715         vmovdqu 16(%rdi),%xmm12
716         vpshufd $0x44,%xmm10,%xmm10
717         vmovdqa %xmm14,-96(%r11)
718         vmovdqa %xmm10,48(%rsp)
719         vpshufd $0xEE,%xmm11,%xmm13
720         vmovdqu 32(%rdi),%xmm10
721         vpshufd $0x44,%xmm11,%xmm11
722         vmovdqa %xmm13,-80(%r11)
723         vmovdqa %xmm11,64(%rsp)
724         vpshufd $0xEE,%xmm12,%xmm14
725         vmovdqu 48(%rdi),%xmm11
726         vpshufd $0x44,%xmm12,%xmm12
727         vmovdqa %xmm14,-64(%r11)
728         vmovdqa %xmm12,80(%rsp)
729         vpshufd $0xEE,%xmm10,%xmm13
730         vmovdqu 64(%rdi),%xmm12
731         vpshufd $0x44,%xmm10,%xmm10
732         vmovdqa %xmm13,-48(%r11)
733         vmovdqa %xmm10,96(%rsp)
734         vpshufd $0xEE,%xmm11,%xmm14
735         vpshufd $0x44,%xmm11,%xmm11
736         vmovdqa %xmm14,-32(%r11)
737         vmovdqa %xmm11,112(%rsp)
738         vpshufd $0xEE,%xmm12,%xmm13
739         vmovdqa 0(%rsp),%xmm14
740         vpshufd $0x44,%xmm12,%xmm12
741         vmovdqa %xmm13,-16(%r11)
742         vmovdqa %xmm12,128(%rsp)
743
744         jmp     .Loop_avx
745
746 .align  32
747 .Loop_avx:
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768         vpmuludq        %xmm5,%xmm14,%xmm10
769         vpmuludq        %xmm6,%xmm14,%xmm11
770         vmovdqa %xmm2,32(%r11)
771         vpmuludq        %xmm7,%xmm14,%xmm12
772         vmovdqa 16(%rsp),%xmm2
773         vpmuludq        %xmm8,%xmm14,%xmm13
774         vpmuludq        %xmm9,%xmm14,%xmm14
775
776         vmovdqa %xmm0,0(%r11)
777         vpmuludq        32(%rsp),%xmm9,%xmm0
778         vmovdqa %xmm1,16(%r11)
779         vpmuludq        %xmm8,%xmm2,%xmm1
780         vpaddq  %xmm0,%xmm10,%xmm10
781         vpaddq  %xmm1,%xmm14,%xmm14
782         vmovdqa %xmm3,48(%r11)
783         vpmuludq        %xmm7,%xmm2,%xmm0
784         vpmuludq        %xmm6,%xmm2,%xmm1
785         vpaddq  %xmm0,%xmm13,%xmm13
786         vmovdqa 48(%rsp),%xmm3
787         vpaddq  %xmm1,%xmm12,%xmm12
788         vmovdqa %xmm4,64(%r11)
789         vpmuludq        %xmm5,%xmm2,%xmm2
790         vpmuludq        %xmm7,%xmm3,%xmm0
791         vpaddq  %xmm2,%xmm11,%xmm11
792
793         vmovdqa 64(%rsp),%xmm4
794         vpaddq  %xmm0,%xmm14,%xmm14
795         vpmuludq        %xmm6,%xmm3,%xmm1
796         vpmuludq        %xmm5,%xmm3,%xmm3
797         vpaddq  %xmm1,%xmm13,%xmm13
798         vmovdqa 80(%rsp),%xmm2
799         vpaddq  %xmm3,%xmm12,%xmm12
800         vpmuludq        %xmm9,%xmm4,%xmm0
801         vpmuludq        %xmm8,%xmm4,%xmm4
802         vpaddq  %xmm0,%xmm11,%xmm11
803         vmovdqa 96(%rsp),%xmm3
804         vpaddq  %xmm4,%xmm10,%xmm10
805
806         vmovdqa 128(%rsp),%xmm4
807         vpmuludq        %xmm6,%xmm2,%xmm1
808         vpmuludq        %xmm5,%xmm2,%xmm2
809         vpaddq  %xmm1,%xmm14,%xmm14
810         vpaddq  %xmm2,%xmm13,%xmm13
811         vpmuludq        %xmm9,%xmm3,%xmm0
812         vpmuludq        %xmm8,%xmm3,%xmm1
813         vpaddq  %xmm0,%xmm12,%xmm12
814         vmovdqu 0(%rsi),%xmm0
815         vpaddq  %xmm1,%xmm11,%xmm11
816         vpmuludq        %xmm7,%xmm3,%xmm3
817         vpmuludq        %xmm7,%xmm4,%xmm7
818         vpaddq  %xmm3,%xmm10,%xmm10
819
820         vmovdqu 16(%rsi),%xmm1
821         vpaddq  %xmm7,%xmm11,%xmm11
822         vpmuludq        %xmm8,%xmm4,%xmm8
823         vpmuludq        %xmm9,%xmm4,%xmm9
824         vpsrldq $6,%xmm0,%xmm2
825         vpaddq  %xmm8,%xmm12,%xmm12
826         vpaddq  %xmm9,%xmm13,%xmm13
827         vpsrldq $6,%xmm1,%xmm3
828         vpmuludq        112(%rsp),%xmm5,%xmm9
829         vpmuludq        %xmm6,%xmm4,%xmm5
830         vpunpckhqdq     %xmm1,%xmm0,%xmm4
831         vpaddq  %xmm9,%xmm14,%xmm14
832         vmovdqa -144(%r11),%xmm9
833         vpaddq  %xmm5,%xmm10,%xmm10
834
835         vpunpcklqdq     %xmm1,%xmm0,%xmm0
836         vpunpcklqdq     %xmm3,%xmm2,%xmm3
837
838
839         vpsrldq $5,%xmm4,%xmm4
840         vpsrlq  $26,%xmm0,%xmm1
841         vpand   %xmm15,%xmm0,%xmm0
842         vpsrlq  $4,%xmm3,%xmm2
843         vpand   %xmm15,%xmm1,%xmm1
844         vpand   0(%rcx),%xmm4,%xmm4
845         vpsrlq  $30,%xmm3,%xmm3
846         vpand   %xmm15,%xmm2,%xmm2
847         vpand   %xmm15,%xmm3,%xmm3
848         vpor    32(%rcx),%xmm4,%xmm4
849
850         vpaddq  0(%r11),%xmm0,%xmm0
851         vpaddq  16(%r11),%xmm1,%xmm1
852         vpaddq  32(%r11),%xmm2,%xmm2
853         vpaddq  48(%r11),%xmm3,%xmm3
854         vpaddq  64(%r11),%xmm4,%xmm4
855
856         leaq    32(%rsi),%rax
857         leaq    64(%rsi),%rsi
858         subq    $64,%rdx
859         cmovcq  %rax,%rsi
860
861
862
863
864
865
866
867
868
869
870         vpmuludq        %xmm0,%xmm9,%xmm5
871         vpmuludq        %xmm1,%xmm9,%xmm6
872         vpaddq  %xmm5,%xmm10,%xmm10
873         vpaddq  %xmm6,%xmm11,%xmm11
874         vmovdqa -128(%r11),%xmm7
875         vpmuludq        %xmm2,%xmm9,%xmm5
876         vpmuludq        %xmm3,%xmm9,%xmm6
877         vpaddq  %xmm5,%xmm12,%xmm12
878         vpaddq  %xmm6,%xmm13,%xmm13
879         vpmuludq        %xmm4,%xmm9,%xmm9
880         vpmuludq        -112(%r11),%xmm4,%xmm5
881         vpaddq  %xmm9,%xmm14,%xmm14
882
883         vpaddq  %xmm5,%xmm10,%xmm10
884         vpmuludq        %xmm2,%xmm7,%xmm6
885         vpmuludq        %xmm3,%xmm7,%xmm5
886         vpaddq  %xmm6,%xmm13,%xmm13
887         vmovdqa -96(%r11),%xmm8
888         vpaddq  %xmm5,%xmm14,%xmm14
889         vpmuludq        %xmm1,%xmm7,%xmm6
890         vpmuludq        %xmm0,%xmm7,%xmm7
891         vpaddq  %xmm6,%xmm12,%xmm12
892         vpaddq  %xmm7,%xmm11,%xmm11
893
894         vmovdqa -80(%r11),%xmm9
895         vpmuludq        %xmm2,%xmm8,%xmm5
896         vpmuludq        %xmm1,%xmm8,%xmm6
897         vpaddq  %xmm5,%xmm14,%xmm14
898         vpaddq  %xmm6,%xmm13,%xmm13
899         vmovdqa -64(%r11),%xmm7
900         vpmuludq        %xmm0,%xmm8,%xmm8
901         vpmuludq        %xmm4,%xmm9,%xmm5
902         vpaddq  %xmm8,%xmm12,%xmm12
903         vpaddq  %xmm5,%xmm11,%xmm11
904         vmovdqa -48(%r11),%xmm8
905         vpmuludq        %xmm3,%xmm9,%xmm9
906         vpmuludq        %xmm1,%xmm7,%xmm6
907         vpaddq  %xmm9,%xmm10,%xmm10
908
909         vmovdqa -16(%r11),%xmm9
910         vpaddq  %xmm6,%xmm14,%xmm14
911         vpmuludq        %xmm0,%xmm7,%xmm7
912         vpmuludq        %xmm4,%xmm8,%xmm5
913         vpaddq  %xmm7,%xmm13,%xmm13
914         vpaddq  %xmm5,%xmm12,%xmm12
915         vmovdqu 32(%rsi),%xmm5
916         vpmuludq        %xmm3,%xmm8,%xmm7
917         vpmuludq        %xmm2,%xmm8,%xmm8
918         vpaddq  %xmm7,%xmm11,%xmm11
919         vmovdqu 48(%rsi),%xmm6
920         vpaddq  %xmm8,%xmm10,%xmm10
921
922         vpmuludq        %xmm2,%xmm9,%xmm2
923         vpmuludq        %xmm3,%xmm9,%xmm3
924         vpsrldq $6,%xmm5,%xmm7
925         vpaddq  %xmm2,%xmm11,%xmm11
926         vpmuludq        %xmm4,%xmm9,%xmm4
927         vpsrldq $6,%xmm6,%xmm8
928         vpaddq  %xmm3,%xmm12,%xmm2
929         vpaddq  %xmm4,%xmm13,%xmm3
930         vpmuludq        -32(%r11),%xmm0,%xmm4
931         vpmuludq        %xmm1,%xmm9,%xmm0
932         vpunpckhqdq     %xmm6,%xmm5,%xmm9
933         vpaddq  %xmm4,%xmm14,%xmm4
934         vpaddq  %xmm0,%xmm10,%xmm0
935
936         vpunpcklqdq     %xmm6,%xmm5,%xmm5
937         vpunpcklqdq     %xmm8,%xmm7,%xmm8
938
939
940         vpsrldq $5,%xmm9,%xmm9
941         vpsrlq  $26,%xmm5,%xmm6
942         vmovdqa 0(%rsp),%xmm14
943         vpand   %xmm15,%xmm5,%xmm5
944         vpsrlq  $4,%xmm8,%xmm7
945         vpand   %xmm15,%xmm6,%xmm6
946         vpand   0(%rcx),%xmm9,%xmm9
947         vpsrlq  $30,%xmm8,%xmm8
948         vpand   %xmm15,%xmm7,%xmm7
949         vpand   %xmm15,%xmm8,%xmm8
950         vpor    32(%rcx),%xmm9,%xmm9
951
952
953
954
955
956         vpsrlq  $26,%xmm3,%xmm13
957         vpand   %xmm15,%xmm3,%xmm3
958         vpaddq  %xmm13,%xmm4,%xmm4
959
960         vpsrlq  $26,%xmm0,%xmm10
961         vpand   %xmm15,%xmm0,%xmm0
962         vpaddq  %xmm10,%xmm11,%xmm1
963
964         vpsrlq  $26,%xmm4,%xmm10
965         vpand   %xmm15,%xmm4,%xmm4
966
967         vpsrlq  $26,%xmm1,%xmm11
968         vpand   %xmm15,%xmm1,%xmm1
969         vpaddq  %xmm11,%xmm2,%xmm2
970
971         vpaddq  %xmm10,%xmm0,%xmm0
972         vpsllq  $2,%xmm10,%xmm10
973         vpaddq  %xmm10,%xmm0,%xmm0
974
975         vpsrlq  $26,%xmm2,%xmm12
976         vpand   %xmm15,%xmm2,%xmm2
977         vpaddq  %xmm12,%xmm3,%xmm3
978
979         vpsrlq  $26,%xmm0,%xmm10
980         vpand   %xmm15,%xmm0,%xmm0
981         vpaddq  %xmm10,%xmm1,%xmm1
982
983         vpsrlq  $26,%xmm3,%xmm13
984         vpand   %xmm15,%xmm3,%xmm3
985         vpaddq  %xmm13,%xmm4,%xmm4
986
987         ja      .Loop_avx
988
989 .Lskip_loop_avx:
990
991
992
993         vpshufd $0x10,%xmm14,%xmm14
994         addq    $32,%rdx
995         jnz     .Long_tail_avx
996
997         vpaddq  %xmm2,%xmm7,%xmm7
998         vpaddq  %xmm0,%xmm5,%xmm5
999         vpaddq  %xmm1,%xmm6,%xmm6
1000         vpaddq  %xmm3,%xmm8,%xmm8
1001         vpaddq  %xmm4,%xmm9,%xmm9
1002
1003 .Long_tail_avx:
1004         vmovdqa %xmm2,32(%r11)
1005         vmovdqa %xmm0,0(%r11)
1006         vmovdqa %xmm1,16(%r11)
1007         vmovdqa %xmm3,48(%r11)
1008         vmovdqa %xmm4,64(%r11)
1009
1010
1011
1012
1013
1014
1015
1016         vpmuludq        %xmm7,%xmm14,%xmm12
1017         vpmuludq        %xmm5,%xmm14,%xmm10
1018         vpshufd $0x10,-48(%rdi),%xmm2
1019         vpmuludq        %xmm6,%xmm14,%xmm11
1020         vpmuludq        %xmm8,%xmm14,%xmm13
1021         vpmuludq        %xmm9,%xmm14,%xmm14
1022
1023         vpmuludq        %xmm8,%xmm2,%xmm0
1024         vpaddq  %xmm0,%xmm14,%xmm14
1025         vpshufd $0x10,-32(%rdi),%xmm3
1026         vpmuludq        %xmm7,%xmm2,%xmm1
1027         vpaddq  %xmm1,%xmm13,%xmm13
1028         vpshufd $0x10,-16(%rdi),%xmm4
1029         vpmuludq        %xmm6,%xmm2,%xmm0
1030         vpaddq  %xmm0,%xmm12,%xmm12
1031         vpmuludq        %xmm5,%xmm2,%xmm2
1032         vpaddq  %xmm2,%xmm11,%xmm11
1033         vpmuludq        %xmm9,%xmm3,%xmm3
1034         vpaddq  %xmm3,%xmm10,%xmm10
1035
1036         vpshufd $0x10,0(%rdi),%xmm2
1037         vpmuludq        %xmm7,%xmm4,%xmm1
1038         vpaddq  %xmm1,%xmm14,%xmm14
1039         vpmuludq        %xmm6,%xmm4,%xmm0
1040         vpaddq  %xmm0,%xmm13,%xmm13
1041         vpshufd $0x10,16(%rdi),%xmm3
1042         vpmuludq        %xmm5,%xmm4,%xmm4
1043         vpaddq  %xmm4,%xmm12,%xmm12
1044         vpmuludq        %xmm9,%xmm2,%xmm1
1045         vpaddq  %xmm1,%xmm11,%xmm11
1046         vpshufd $0x10,32(%rdi),%xmm4
1047         vpmuludq        %xmm8,%xmm2,%xmm2
1048         vpaddq  %xmm2,%xmm10,%xmm10
1049
1050         vpmuludq        %xmm6,%xmm3,%xmm0
1051         vpaddq  %xmm0,%xmm14,%xmm14
1052         vpmuludq        %xmm5,%xmm3,%xmm3
1053         vpaddq  %xmm3,%xmm13,%xmm13
1054         vpshufd $0x10,48(%rdi),%xmm2
1055         vpmuludq        %xmm9,%xmm4,%xmm1
1056         vpaddq  %xmm1,%xmm12,%xmm12
1057         vpshufd $0x10,64(%rdi),%xmm3
1058         vpmuludq        %xmm8,%xmm4,%xmm0
1059         vpaddq  %xmm0,%xmm11,%xmm11
1060         vpmuludq        %xmm7,%xmm4,%xmm4
1061         vpaddq  %xmm4,%xmm10,%xmm10
1062
1063         vpmuludq        %xmm5,%xmm2,%xmm2
1064         vpaddq  %xmm2,%xmm14,%xmm14
1065         vpmuludq        %xmm9,%xmm3,%xmm1
1066         vpaddq  %xmm1,%xmm13,%xmm13
1067         vpmuludq        %xmm8,%xmm3,%xmm0
1068         vpaddq  %xmm0,%xmm12,%xmm12
1069         vpmuludq        %xmm7,%xmm3,%xmm1
1070         vpaddq  %xmm1,%xmm11,%xmm11
1071         vpmuludq        %xmm6,%xmm3,%xmm3
1072         vpaddq  %xmm3,%xmm10,%xmm10
1073
1074         jz      .Lshort_tail_avx
1075
1076         vmovdqu 0(%rsi),%xmm0
1077         vmovdqu 16(%rsi),%xmm1
1078
1079         vpsrldq $6,%xmm0,%xmm2
1080         vpsrldq $6,%xmm1,%xmm3
1081         vpunpckhqdq     %xmm1,%xmm0,%xmm4
1082         vpunpcklqdq     %xmm1,%xmm0,%xmm0
1083         vpunpcklqdq     %xmm3,%xmm2,%xmm3
1084
1085         vpsrlq  $40,%xmm4,%xmm4
1086         vpsrlq  $26,%xmm0,%xmm1
1087         vpand   %xmm15,%xmm0,%xmm0
1088         vpsrlq  $4,%xmm3,%xmm2
1089         vpand   %xmm15,%xmm1,%xmm1
1090         vpsrlq  $30,%xmm3,%xmm3
1091         vpand   %xmm15,%xmm2,%xmm2
1092         vpand   %xmm15,%xmm3,%xmm3
1093         vpor    32(%rcx),%xmm4,%xmm4
1094
1095         vpshufd $0x32,-64(%rdi),%xmm9
1096         vpaddq  0(%r11),%xmm0,%xmm0
1097         vpaddq  16(%r11),%xmm1,%xmm1
1098         vpaddq  32(%r11),%xmm2,%xmm2
1099         vpaddq  48(%r11),%xmm3,%xmm3
1100         vpaddq  64(%r11),%xmm4,%xmm4
1101
1102
1103
1104
1105         vpmuludq        %xmm0,%xmm9,%xmm5
1106         vpaddq  %xmm5,%xmm10,%xmm10
1107         vpmuludq        %xmm1,%xmm9,%xmm6
1108         vpaddq  %xmm6,%xmm11,%xmm11
1109         vpmuludq        %xmm2,%xmm9,%xmm5
1110         vpaddq  %xmm5,%xmm12,%xmm12
1111         vpshufd $0x32,-48(%rdi),%xmm7
1112         vpmuludq        %xmm3,%xmm9,%xmm6
1113         vpaddq  %xmm6,%xmm13,%xmm13
1114         vpmuludq        %xmm4,%xmm9,%xmm9
1115         vpaddq  %xmm9,%xmm14,%xmm14
1116
1117         vpmuludq        %xmm3,%xmm7,%xmm5
1118         vpaddq  %xmm5,%xmm14,%xmm14
1119         vpshufd $0x32,-32(%rdi),%xmm8
1120         vpmuludq        %xmm2,%xmm7,%xmm6
1121         vpaddq  %xmm6,%xmm13,%xmm13
1122         vpshufd $0x32,-16(%rdi),%xmm9
1123         vpmuludq        %xmm1,%xmm7,%xmm5
1124         vpaddq  %xmm5,%xmm12,%xmm12
1125         vpmuludq        %xmm0,%xmm7,%xmm7
1126         vpaddq  %xmm7,%xmm11,%xmm11
1127         vpmuludq        %xmm4,%xmm8,%xmm8
1128         vpaddq  %xmm8,%xmm10,%xmm10
1129
1130         vpshufd $0x32,0(%rdi),%xmm7
1131         vpmuludq        %xmm2,%xmm9,%xmm6
1132         vpaddq  %xmm6,%xmm14,%xmm14
1133         vpmuludq        %xmm1,%xmm9,%xmm5
1134         vpaddq  %xmm5,%xmm13,%xmm13
1135         vpshufd $0x32,16(%rdi),%xmm8
1136         vpmuludq        %xmm0,%xmm9,%xmm9
1137         vpaddq  %xmm9,%xmm12,%xmm12
1138         vpmuludq        %xmm4,%xmm7,%xmm6
1139         vpaddq  %xmm6,%xmm11,%xmm11
1140         vpshufd $0x32,32(%rdi),%xmm9
1141         vpmuludq        %xmm3,%xmm7,%xmm7
1142         vpaddq  %xmm7,%xmm10,%xmm10
1143
1144         vpmuludq        %xmm1,%xmm8,%xmm5
1145         vpaddq  %xmm5,%xmm14,%xmm14
1146         vpmuludq        %xmm0,%xmm8,%xmm8
1147         vpaddq  %xmm8,%xmm13,%xmm13
1148         vpshufd $0x32,48(%rdi),%xmm7
1149         vpmuludq        %xmm4,%xmm9,%xmm6
1150         vpaddq  %xmm6,%xmm12,%xmm12
1151         vpshufd $0x32,64(%rdi),%xmm8
1152         vpmuludq        %xmm3,%xmm9,%xmm5
1153         vpaddq  %xmm5,%xmm11,%xmm11
1154         vpmuludq        %xmm2,%xmm9,%xmm9
1155         vpaddq  %xmm9,%xmm10,%xmm10
1156
1157         vpmuludq        %xmm0,%xmm7,%xmm7
1158         vpaddq  %xmm7,%xmm14,%xmm14
1159         vpmuludq        %xmm4,%xmm8,%xmm6
1160         vpaddq  %xmm6,%xmm13,%xmm13
1161         vpmuludq        %xmm3,%xmm8,%xmm5
1162         vpaddq  %xmm5,%xmm12,%xmm12
1163         vpmuludq        %xmm2,%xmm8,%xmm6
1164         vpaddq  %xmm6,%xmm11,%xmm11
1165         vpmuludq        %xmm1,%xmm8,%xmm8
1166         vpaddq  %xmm8,%xmm10,%xmm10
1167
1168 .Lshort_tail_avx:
1169
1170
1171
1172         vpsrldq $8,%xmm14,%xmm9
1173         vpsrldq $8,%xmm13,%xmm8
1174         vpsrldq $8,%xmm11,%xmm6
1175         vpsrldq $8,%xmm10,%xmm5
1176         vpsrldq $8,%xmm12,%xmm7
1177         vpaddq  %xmm8,%xmm13,%xmm13
1178         vpaddq  %xmm9,%xmm14,%xmm14
1179         vpaddq  %xmm5,%xmm10,%xmm10
1180         vpaddq  %xmm6,%xmm11,%xmm11
1181         vpaddq  %xmm7,%xmm12,%xmm12
1182
1183
1184
1185
1186         vpsrlq  $26,%xmm13,%xmm3
1187         vpand   %xmm15,%xmm13,%xmm13
1188         vpaddq  %xmm3,%xmm14,%xmm14
1189
1190         vpsrlq  $26,%xmm10,%xmm0
1191         vpand   %xmm15,%xmm10,%xmm10
1192         vpaddq  %xmm0,%xmm11,%xmm11
1193
1194         vpsrlq  $26,%xmm14,%xmm4
1195         vpand   %xmm15,%xmm14,%xmm14
1196
1197         vpsrlq  $26,%xmm11,%xmm1
1198         vpand   %xmm15,%xmm11,%xmm11
1199         vpaddq  %xmm1,%xmm12,%xmm12
1200
1201         vpaddq  %xmm4,%xmm10,%xmm10
1202         vpsllq  $2,%xmm4,%xmm4
1203         vpaddq  %xmm4,%xmm10,%xmm10
1204
1205         vpsrlq  $26,%xmm12,%xmm2
1206         vpand   %xmm15,%xmm12,%xmm12
1207         vpaddq  %xmm2,%xmm13,%xmm13
1208
1209         vpsrlq  $26,%xmm10,%xmm0
1210         vpand   %xmm15,%xmm10,%xmm10
1211         vpaddq  %xmm0,%xmm11,%xmm11
1212
1213         vpsrlq  $26,%xmm13,%xmm3
1214         vpand   %xmm15,%xmm13,%xmm13
1215         vpaddq  %xmm3,%xmm14,%xmm14
1216
1217         vmovd   %xmm10,-112(%rdi)
1218         vmovd   %xmm11,-108(%rdi)
1219         vmovd   %xmm12,-104(%rdi)
1220         vmovd   %xmm13,-100(%rdi)
1221         vmovd   %xmm14,-96(%rdi)
1222         leaq    88(%r11),%rsp
1223 .cfi_def_cfa    %rsp,8
1224         vzeroupper
1225         .byte   0xf3,0xc3
1226 .cfi_endproc    
1227 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
1228
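/*
 * poly1305_emit_avx: emit for a state that the vector paths left in base
 * 2^26 (ctx+20 != 0); it folds the five 26-bit limbs back into three
 * 64-bit words, then performs the same final reduction and nonce addition
 * as poly1305_emit.  If the flag is clear it simply branches to .Lemit.
 */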
1229 .type   poly1305_emit_avx,@function
1230 .align  32
1231 poly1305_emit_avx:
1232         cmpl    $0,20(%rdi)
1233         je      .Lemit
1234
1235         movl    0(%rdi),%eax
1236         movl    4(%rdi),%ecx
1237         movl    8(%rdi),%r8d
1238         movl    12(%rdi),%r11d
1239         movl    16(%rdi),%r10d
1240
1241         shlq    $26,%rcx
1242         movq    %r8,%r9
1243         shlq    $52,%r8
1244         addq    %rcx,%rax
1245         shrq    $12,%r9
1246         addq    %rax,%r8
1247         adcq    $0,%r9
1248
1249         shlq    $14,%r11
1250         movq    %r10,%rax
1251         shrq    $24,%r10
1252         addq    %r11,%r9
1253         shlq    $40,%rax
1254         addq    %rax,%r9
1255         adcq    $0,%r10
1256
1257         movq    %r10,%rax
1258         movq    %r10,%rcx
1259         andq    $3,%r10
1260         shrq    $2,%rax
1261         andq    $-4,%rcx
1262         addq    %rcx,%rax
1263         addq    %rax,%r8
1264         adcq    $0,%r9
1265         adcq    $0,%r10
1266
1267         movq    %r8,%rax
1268         addq    $5,%r8
1269         movq    %r9,%rcx
1270         adcq    $0,%r9
1271         adcq    $0,%r10
1272         shrq    $2,%r10
1273         cmovnzq %r8,%rax
1274         cmovnzq %r9,%rcx
1275
1276         addq    0(%rdx),%rax
1277         adcq    8(%rdx),%rcx
1278         movq    %rax,0(%rsi)
1279         movq    %rcx,8(%rsi)
1280
1281         .byte   0xf3,0xc3
1282 .size   poly1305_emit_avx,.-poly1305_emit_avx
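/*
 * poly1305_blocks_avx2: AVX2 flavour of the same interface; the main
 * .Loop_avx2 consumes 64 bytes (four blocks) per iteration using 256-bit
 * vectors.  Selected by poly1305_init when the AVX2 capability bit of
 * OPENSSL_ia32cap_P is set.
 */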
1283 .type   poly1305_blocks_avx2,@function
1284 .align  32
1285 poly1305_blocks_avx2:
1286 .cfi_startproc  
1287         movl    20(%rdi),%r8d
1288         cmpq    $128,%rdx
1289         jae     .Lblocks_avx2
1290         testl   %r8d,%r8d
1291         jz      .Lblocks
1292
1293 .Lblocks_avx2:
1294         andq    $-16,%rdx
1295         jz      .Lno_data_avx2
1296
1297         vzeroupper
1298
1299         testl   %r8d,%r8d
1300         jz      .Lbase2_64_avx2
1301
1302         testq   $63,%rdx
1303         jz      .Leven_avx2
1304
1305         pushq   %rbx
1306 .cfi_adjust_cfa_offset  8
1307 .cfi_offset     %rbx,-16
1308         pushq   %rbp
1309 .cfi_adjust_cfa_offset  8
1310 .cfi_offset     %rbp,-24
1311         pushq   %r12
1312 .cfi_adjust_cfa_offset  8
1313 .cfi_offset     %r12,-32
1314         pushq   %r13
1315 .cfi_adjust_cfa_offset  8
1316 .cfi_offset     %r13,-40
1317         pushq   %r14
1318 .cfi_adjust_cfa_offset  8
1319 .cfi_offset     %r14,-48
1320         pushq   %r15
1321 .cfi_adjust_cfa_offset  8
1322 .cfi_offset     %r15,-56
1323 .Lblocks_avx2_body:
1324
1325         movq    %rdx,%r15
1326
1327         movq    0(%rdi),%r8
1328         movq    8(%rdi),%r9
1329         movl    16(%rdi),%ebp
1330
1331         movq    24(%rdi),%r11
1332         movq    32(%rdi),%r13
1333
1334
1335         movl    %r8d,%r14d
1336         andq    $-2147483648,%r8
1337         movq    %r9,%r12
1338         movl    %r9d,%ebx
1339         andq    $-2147483648,%r9
1340
1341         shrq    $6,%r8
1342         shlq    $52,%r12
1343         addq    %r8,%r14
1344         shrq    $12,%rbx
1345         shrq    $18,%r9
1346         addq    %r12,%r14
1347         adcq    %r9,%rbx
1348
1349         movq    %rbp,%r8
1350         shlq    $40,%r8
1351         shrq    $24,%rbp
1352         addq    %r8,%rbx
1353         adcq    $0,%rbp
1354
1355         movq    $-4,%r9
1356         movq    %rbp,%r8
1357         andq    %rbp,%r9
1358         shrq    $2,%r8
1359         andq    $3,%rbp
1360         addq    %r9,%r8
1361         addq    %r8,%r14
1362         adcq    $0,%rbx
1363         adcq    $0,%rbp
1364
1365         movq    %r13,%r12
1366         movq    %r13,%rax
1367         shrq    $2,%r13
1368         addq    %r12,%r13
1369
1370 .Lbase2_26_pre_avx2:
1371         addq    0(%rsi),%r14
1372         adcq    8(%rsi),%rbx
1373         leaq    16(%rsi),%rsi
1374         adcq    %rcx,%rbp
1375         subq    $16,%r15
1376
1377         call    __poly1305_block
1378         movq    %r12,%rax
1379
1380         testq   $63,%r15
1381         jnz     .Lbase2_26_pre_avx2
1382
1383         testq   %rcx,%rcx
1384         jz      .Lstore_base2_64_avx2
1385
1386
1387         movq    %r14,%rax
1388         movq    %r14,%rdx
1389         shrq    $52,%r14
1390         movq    %rbx,%r11
1391         movq    %rbx,%r12
1392         shrq    $26,%rdx
1393         andq    $0x3ffffff,%rax
1394         shlq    $12,%r11
1395         andq    $0x3ffffff,%rdx
1396         shrq    $14,%rbx
1397         orq     %r11,%r14
1398         shlq    $24,%rbp
1399         andq    $0x3ffffff,%r14
1400         shrq    $40,%r12
1401         andq    $0x3ffffff,%rbx
1402         orq     %r12,%rbp
1403
1404         testq   %r15,%r15
1405         jz      .Lstore_base2_26_avx2
1406
1407         vmovd   %eax,%xmm0
1408         vmovd   %edx,%xmm1
1409         vmovd   %r14d,%xmm2
1410         vmovd   %ebx,%xmm3
1411         vmovd   %ebp,%xmm4
1412         jmp     .Lproceed_avx2
1413
1414 .align  32
1415 .Lstore_base2_64_avx2:
1416         movq    %r14,0(%rdi)
1417         movq    %rbx,8(%rdi)
1418         movq    %rbp,16(%rdi)
1419         jmp     .Ldone_avx2
1420
1421 .align  16
1422 .Lstore_base2_26_avx2:
1423         movl    %eax,0(%rdi)
1424         movl    %edx,4(%rdi)
1425         movl    %r14d,8(%rdi)
1426         movl    %ebx,12(%rdi)
1427         movl    %ebp,16(%rdi)
1428 .align  16
1429 .Ldone_avx2:
1430         movq    0(%rsp),%r15
1431 .cfi_restore    %r15
1432         movq    8(%rsp),%r14
1433 .cfi_restore    %r14
1434         movq    16(%rsp),%r13
1435 .cfi_restore    %r13
1436         movq    24(%rsp),%r12
1437 .cfi_restore    %r12
1438         movq    32(%rsp),%rbp
1439 .cfi_restore    %rbp
1440         movq    40(%rsp),%rbx
1441 .cfi_restore    %rbx
1442         leaq    48(%rsp),%rsp
1443 .cfi_adjust_cfa_offset  -48
1444 .Lno_data_avx2:
1445 .Lblocks_avx2_epilogue:
1446         .byte   0xf3,0xc3
1447 .cfi_endproc    
1448
1449 .align  32
1450 .Lbase2_64_avx2:
1451 .cfi_startproc  
1452         pushq   %rbx
1453 .cfi_adjust_cfa_offset  8
1454 .cfi_offset     %rbx,-16
1455         pushq   %rbp
1456 .cfi_adjust_cfa_offset  8
1457 .cfi_offset     %rbp,-24
1458         pushq   %r12
1459 .cfi_adjust_cfa_offset  8
1460 .cfi_offset     %r12,-32
1461         pushq   %r13
1462 .cfi_adjust_cfa_offset  8
1463 .cfi_offset     %r13,-40
1464         pushq   %r14
1465 .cfi_adjust_cfa_offset  8
1466 .cfi_offset     %r14,-48
1467         pushq   %r15
1468 .cfi_adjust_cfa_offset  8
1469 .cfi_offset     %r15,-56
1470 .Lbase2_64_avx2_body:
1471
1472         movq    %rdx,%r15
1473
1474         movq    24(%rdi),%r11
1475         movq    32(%rdi),%r13
1476
1477         movq    0(%rdi),%r14
1478         movq    8(%rdi),%rbx
1479         movl    16(%rdi),%ebp
1480
1481         movq    %r13,%r12
1482         movq    %r13,%rax
1483         shrq    $2,%r13
1484         addq    %r12,%r13
1485
1486         testq   $63,%rdx
1487         jz      .Linit_avx2
1488
1489 .Lbase2_64_pre_avx2:
1490         addq    0(%rsi),%r14
1491         adcq    8(%rsi),%rbx
1492         leaq    16(%rsi),%rsi
1493         adcq    %rcx,%rbp
1494         subq    $16,%r15
1495
1496         call    __poly1305_block
1497         movq    %r12,%rax
1498
1499         testq   $63,%r15
1500         jnz     .Lbase2_64_pre_avx2
1501
1502 .Linit_avx2:
1503
1504         movq    %r14,%rax
1505         movq    %r14,%rdx
1506         shrq    $52,%r14
1507         movq    %rbx,%r8
1508         movq    %rbx,%r9
1509         shrq    $26,%rdx
1510         andq    $0x3ffffff,%rax
1511         shlq    $12,%r8
1512         andq    $0x3ffffff,%rdx
1513         shrq    $14,%rbx
1514         orq     %r8,%r14
1515         shlq    $24,%rbp
1516         andq    $0x3ffffff,%r14
1517         shrq    $40,%r9
1518         andq    $0x3ffffff,%rbx
1519         orq     %r9,%rbp
1520
1521         vmovd   %eax,%xmm0
1522         vmovd   %edx,%xmm1
1523         vmovd   %r14d,%xmm2
1524         vmovd   %ebx,%xmm3
1525         vmovd   %ebp,%xmm4
1526         movl    $1,20(%rdi)
1527
1528         call    __poly1305_init_avx
1529
1530 .Lproceed_avx2:
1531         movq    %r15,%rdx
1532         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
1533         movl    $3221291008,%r11d
1534
1535         movq    0(%rsp),%r15
1536 .cfi_restore    %r15
1537         movq    8(%rsp),%r14
1538 .cfi_restore    %r14
1539         movq    16(%rsp),%r13
1540 .cfi_restore    %r13
1541         movq    24(%rsp),%r12
1542 .cfi_restore    %r12
1543         movq    32(%rsp),%rbp
1544 .cfi_restore    %rbp
1545         movq    40(%rsp),%rbx
1546 .cfi_restore    %rbx
1547         leaq    48(%rsp),%rax
1548         leaq    48(%rsp),%rsp
1549 .cfi_adjust_cfa_offset  -48
1550 .Lbase2_64_avx2_epilogue:
1551         jmp     .Ldo_avx2
1552 .cfi_endproc    
1553
1554 .align  32
1555 .Leven_avx2:
1556 .cfi_startproc  
1557         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
1558         vmovd   0(%rdi),%xmm0
1559         vmovd   4(%rdi),%xmm1
1560         vmovd   8(%rdi),%xmm2
1561         vmovd   12(%rdi),%xmm3
1562         vmovd   16(%rdi),%xmm4
1563
1564 .Ldo_avx2:
1565         leaq    -8(%rsp),%r11
1566 .cfi_def_cfa    %r11,16
1567         subq    $0x128,%rsp
1568         leaq    .Lconst(%rip),%rcx
1569         leaq    48+64(%rdi),%rdi
1570         vmovdqa 96(%rcx),%ymm7
1571
1572
1573         vmovdqu -64(%rdi),%xmm9
1574         andq    $-512,%rsp
1575         vmovdqu -48(%rdi),%xmm10
1576         vmovdqu -32(%rdi),%xmm6
1577         vmovdqu -16(%rdi),%xmm11
1578         vmovdqu 0(%rdi),%xmm12
1579         vmovdqu 16(%rdi),%xmm13
1580         leaq    144(%rsp),%rax
1581         vmovdqu 32(%rdi),%xmm14
1582         vpermd  %ymm9,%ymm7,%ymm9
1583         vmovdqu 48(%rdi),%xmm15
1584         vpermd  %ymm10,%ymm7,%ymm10
1585         vmovdqu 64(%rdi),%xmm5
1586         vpermd  %ymm6,%ymm7,%ymm6
1587         vmovdqa %ymm9,0(%rsp)
1588         vpermd  %ymm11,%ymm7,%ymm11
1589         vmovdqa %ymm10,32-144(%rax)
1590         vpermd  %ymm12,%ymm7,%ymm12
1591         vmovdqa %ymm6,64-144(%rax)
1592         vpermd  %ymm13,%ymm7,%ymm13
1593         vmovdqa %ymm11,96-144(%rax)
1594         vpermd  %ymm14,%ymm7,%ymm14
1595         vmovdqa %ymm12,128-144(%rax)
1596         vpermd  %ymm15,%ymm7,%ymm15
1597         vmovdqa %ymm13,160-144(%rax)
1598         vpermd  %ymm5,%ymm7,%ymm5
1599         vmovdqa %ymm14,192-144(%rax)
1600         vmovdqa %ymm15,224-144(%rax)
1601         vmovdqa %ymm5,256-144(%rax)
1602         vmovdqa 64(%rcx),%ymm5
1603
1604
1605
1606         vmovdqu 0(%rsi),%xmm7
1607         vmovdqu 16(%rsi),%xmm8
1608         vinserti128     $1,32(%rsi),%ymm7,%ymm7
1609         vinserti128     $1,48(%rsi),%ymm8,%ymm8
1610         leaq    64(%rsi),%rsi
1611
1612         vpsrldq $6,%ymm7,%ymm9
1613         vpsrldq $6,%ymm8,%ymm10
1614         vpunpckhqdq     %ymm8,%ymm7,%ymm6
1615         vpunpcklqdq     %ymm10,%ymm9,%ymm9
1616         vpunpcklqdq     %ymm8,%ymm7,%ymm7
1617
1618         vpsrlq  $30,%ymm9,%ymm10
1619         vpsrlq  $4,%ymm9,%ymm9
1620         vpsrlq  $26,%ymm7,%ymm8
1621         vpsrlq  $40,%ymm6,%ymm6
1622         vpand   %ymm5,%ymm9,%ymm9
1623         vpand   %ymm5,%ymm7,%ymm7
1624         vpand   %ymm5,%ymm8,%ymm8
1625         vpand   %ymm5,%ymm10,%ymm10
1626         vpor    32(%rcx),%ymm6,%ymm6
1627
1628         vpaddq  %ymm2,%ymm9,%ymm2
1629         subq    $64,%rdx
1630         jz      .Ltail_avx2
1631         jmp     .Loop_avx2
1632
1633 .align  32
1634 .Loop_avx2:
1635
1636
1637
1638
1639
1640
1641
1642
1643         vpaddq  %ymm0,%ymm7,%ymm0
1644         vmovdqa 0(%rsp),%ymm7
1645         vpaddq  %ymm1,%ymm8,%ymm1
1646         vmovdqa 32(%rsp),%ymm8
1647         vpaddq  %ymm3,%ymm10,%ymm3
1648         vmovdqa 96(%rsp),%ymm9
1649         vpaddq  %ymm4,%ymm6,%ymm4
1650         vmovdqa 48(%rax),%ymm10
1651         vmovdqa 112(%rax),%ymm5
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668         vpmuludq        %ymm2,%ymm7,%ymm13
1669         vpmuludq        %ymm2,%ymm8,%ymm14
1670         vpmuludq        %ymm2,%ymm9,%ymm15
1671         vpmuludq        %ymm2,%ymm10,%ymm11
1672         vpmuludq        %ymm2,%ymm5,%ymm12
1673
1674         vpmuludq        %ymm0,%ymm8,%ymm6
1675         vpmuludq        %ymm1,%ymm8,%ymm2
1676         vpaddq  %ymm6,%ymm12,%ymm12
1677         vpaddq  %ymm2,%ymm13,%ymm13
1678         vpmuludq        %ymm3,%ymm8,%ymm6
1679         vpmuludq        64(%rsp),%ymm4,%ymm2
1680         vpaddq  %ymm6,%ymm15,%ymm15
1681         vpaddq  %ymm2,%ymm11,%ymm11
1682         vmovdqa -16(%rax),%ymm8
1683
1684         vpmuludq        %ymm0,%ymm7,%ymm6
1685         vpmuludq        %ymm1,%ymm7,%ymm2
1686         vpaddq  %ymm6,%ymm11,%ymm11
1687         vpaddq  %ymm2,%ymm12,%ymm12
1688         vpmuludq        %ymm3,%ymm7,%ymm6
1689         vpmuludq        %ymm4,%ymm7,%ymm2
1690         vmovdqu 0(%rsi),%xmm7
1691         vpaddq  %ymm6,%ymm14,%ymm14
1692         vpaddq  %ymm2,%ymm15,%ymm15
1693         vinserti128     $1,32(%rsi),%ymm7,%ymm7
1694
1695         vpmuludq        %ymm3,%ymm8,%ymm6
1696         vpmuludq        %ymm4,%ymm8,%ymm2
1697         vmovdqu 16(%rsi),%xmm8
1698         vpaddq  %ymm6,%ymm11,%ymm11
1699         vpaddq  %ymm2,%ymm12,%ymm12
1700         vmovdqa 16(%rax),%ymm2
1701         vpmuludq        %ymm1,%ymm9,%ymm6
1702         vpmuludq        %ymm0,%ymm9,%ymm9
1703         vpaddq  %ymm6,%ymm14,%ymm14
1704         vpaddq  %ymm9,%ymm13,%ymm13
1705         vinserti128     $1,48(%rsi),%ymm8,%ymm8
1706         leaq    64(%rsi),%rsi
1707
1708         vpmuludq        %ymm1,%ymm2,%ymm6
1709         vpmuludq        %ymm0,%ymm2,%ymm2
1710         vpsrldq $6,%ymm7,%ymm9
1711         vpaddq  %ymm6,%ymm15,%ymm15
1712         vpaddq  %ymm2,%ymm14,%ymm14
1713         vpmuludq        %ymm3,%ymm10,%ymm6
1714         vpmuludq        %ymm4,%ymm10,%ymm2
1715         vpsrldq $6,%ymm8,%ymm10
1716         vpaddq  %ymm6,%ymm12,%ymm12
1717         vpaddq  %ymm2,%ymm13,%ymm13
1718         vpunpckhqdq     %ymm8,%ymm7,%ymm6
1719
1720         vpmuludq        %ymm3,%ymm5,%ymm3
1721         vpmuludq        %ymm4,%ymm5,%ymm4
1722         vpunpcklqdq     %ymm8,%ymm7,%ymm7
1723         vpaddq  %ymm3,%ymm13,%ymm2
1724         vpaddq  %ymm4,%ymm14,%ymm3
1725         vpunpcklqdq     %ymm10,%ymm9,%ymm10
1726         vpmuludq        80(%rax),%ymm0,%ymm4
1727         vpmuludq        %ymm1,%ymm5,%ymm0
1728         vmovdqa 64(%rcx),%ymm5
1729         vpaddq  %ymm4,%ymm15,%ymm4
1730         vpaddq  %ymm0,%ymm11,%ymm0
1731
1732
1733
1734
1735         vpsrlq  $26,%ymm3,%ymm14
1736         vpand   %ymm5,%ymm3,%ymm3
1737         vpaddq  %ymm14,%ymm4,%ymm4
1738
1739         vpsrlq  $26,%ymm0,%ymm11
1740         vpand   %ymm5,%ymm0,%ymm0
1741         vpaddq  %ymm11,%ymm12,%ymm1
1742
1743         vpsrlq  $26,%ymm4,%ymm15
1744         vpand   %ymm5,%ymm4,%ymm4
1745
1746         vpsrlq  $4,%ymm10,%ymm9
1747
1748         vpsrlq  $26,%ymm1,%ymm12
1749         vpand   %ymm5,%ymm1,%ymm1
1750         vpaddq  %ymm12,%ymm2,%ymm2
1751
1752         vpaddq  %ymm15,%ymm0,%ymm0
1753         vpsllq  $2,%ymm15,%ymm15
1754         vpaddq  %ymm15,%ymm0,%ymm0
1755
1756         vpand   %ymm5,%ymm9,%ymm9
1757         vpsrlq  $26,%ymm7,%ymm8
1758
1759         vpsrlq  $26,%ymm2,%ymm13
1760         vpand   %ymm5,%ymm2,%ymm2
1761         vpaddq  %ymm13,%ymm3,%ymm3
1762
1763         vpaddq  %ymm9,%ymm2,%ymm2
1764         vpsrlq  $30,%ymm10,%ymm10
1765
1766         vpsrlq  $26,%ymm0,%ymm11
1767         vpand   %ymm5,%ymm0,%ymm0
1768         vpaddq  %ymm11,%ymm1,%ymm1
1769
1770         vpsrlq  $40,%ymm6,%ymm6
1771
1772         vpsrlq  $26,%ymm3,%ymm14
1773         vpand   %ymm5,%ymm3,%ymm3
1774         vpaddq  %ymm14,%ymm4,%ymm4
1775
1776         vpand   %ymm5,%ymm7,%ymm7
1777         vpand   %ymm5,%ymm8,%ymm8
1778         vpand   %ymm5,%ymm10,%ymm10
1779         vpor    32(%rcx),%ymm6,%ymm6
1780
1781         subq    $64,%rdx
1782         jnz     .Loop_avx2
1783
1784 .byte   0x66,0x90
1785 .Ltail_avx2:
1786
1787
1788
1789
1790
1791
1792
1793         vpaddq  %ymm0,%ymm7,%ymm0
1794         vmovdqu 4(%rsp),%ymm7
1795         vpaddq  %ymm1,%ymm8,%ymm1
1796         vmovdqu 36(%rsp),%ymm8
1797         vpaddq  %ymm3,%ymm10,%ymm3
1798         vmovdqu 100(%rsp),%ymm9
1799         vpaddq  %ymm4,%ymm6,%ymm4
1800         vmovdqu 52(%rax),%ymm10
1801         vmovdqu 116(%rax),%ymm5
1802
1803         vpmuludq        %ymm2,%ymm7,%ymm13
1804         vpmuludq        %ymm2,%ymm8,%ymm14
1805         vpmuludq        %ymm2,%ymm9,%ymm15
1806         vpmuludq        %ymm2,%ymm10,%ymm11
1807         vpmuludq        %ymm2,%ymm5,%ymm12
1808
1809         vpmuludq        %ymm0,%ymm8,%ymm6
1810         vpmuludq        %ymm1,%ymm8,%ymm2
1811         vpaddq  %ymm6,%ymm12,%ymm12
1812         vpaddq  %ymm2,%ymm13,%ymm13
1813         vpmuludq        %ymm3,%ymm8,%ymm6
1814         vpmuludq        68(%rsp),%ymm4,%ymm2
1815         vpaddq  %ymm6,%ymm15,%ymm15
1816         vpaddq  %ymm2,%ymm11,%ymm11
1817
1818         vpmuludq        %ymm0,%ymm7,%ymm6
1819         vpmuludq        %ymm1,%ymm7,%ymm2
1820         vpaddq  %ymm6,%ymm11,%ymm11
1821         vmovdqu -12(%rax),%ymm8
1822         vpaddq  %ymm2,%ymm12,%ymm12
1823         vpmuludq        %ymm3,%ymm7,%ymm6
1824         vpmuludq        %ymm4,%ymm7,%ymm2
1825         vpaddq  %ymm6,%ymm14,%ymm14
1826         vpaddq  %ymm2,%ymm15,%ymm15
1827
1828         vpmuludq        %ymm3,%ymm8,%ymm6
1829         vpmuludq        %ymm4,%ymm8,%ymm2
1830         vpaddq  %ymm6,%ymm11,%ymm11
1831         vpaddq  %ymm2,%ymm12,%ymm12
1832         vmovdqu 20(%rax),%ymm2
1833         vpmuludq        %ymm1,%ymm9,%ymm6
1834         vpmuludq        %ymm0,%ymm9,%ymm9
1835         vpaddq  %ymm6,%ymm14,%ymm14
1836         vpaddq  %ymm9,%ymm13,%ymm13
1837
1838         vpmuludq        %ymm1,%ymm2,%ymm6
1839         vpmuludq        %ymm0,%ymm2,%ymm2
1840         vpaddq  %ymm6,%ymm15,%ymm15
1841         vpaddq  %ymm2,%ymm14,%ymm14
1842         vpmuludq        %ymm3,%ymm10,%ymm6
1843         vpmuludq        %ymm4,%ymm10,%ymm2
1844         vpaddq  %ymm6,%ymm12,%ymm12
1845         vpaddq  %ymm2,%ymm13,%ymm13
1846
1847         vpmuludq        %ymm3,%ymm5,%ymm3
1848         vpmuludq        %ymm4,%ymm5,%ymm4
1849         vpaddq  %ymm3,%ymm13,%ymm2
1850         vpaddq  %ymm4,%ymm14,%ymm3
1851         vpmuludq        84(%rax),%ymm0,%ymm4
1852         vpmuludq        %ymm1,%ymm5,%ymm0
1853         vmovdqa 64(%rcx),%ymm5
1854         vpaddq  %ymm4,%ymm15,%ymm4
1855         vpaddq  %ymm0,%ymm11,%ymm0
1856
1857
1858
1859
1860         vpsrldq $8,%ymm12,%ymm8
1861         vpsrldq $8,%ymm2,%ymm9
1862         vpsrldq $8,%ymm3,%ymm10
1863         vpsrldq $8,%ymm4,%ymm6
1864         vpsrldq $8,%ymm0,%ymm7
1865         vpaddq  %ymm8,%ymm12,%ymm12
1866         vpaddq  %ymm9,%ymm2,%ymm2
1867         vpaddq  %ymm10,%ymm3,%ymm3
1868         vpaddq  %ymm6,%ymm4,%ymm4
1869         vpaddq  %ymm7,%ymm0,%ymm0
1870
1871         vpermq  $0x2,%ymm3,%ymm10
1872         vpermq  $0x2,%ymm4,%ymm6
1873         vpermq  $0x2,%ymm0,%ymm7
1874         vpermq  $0x2,%ymm12,%ymm8
1875         vpermq  $0x2,%ymm2,%ymm9
1876         vpaddq  %ymm10,%ymm3,%ymm3
1877         vpaddq  %ymm6,%ymm4,%ymm4
1878         vpaddq  %ymm7,%ymm0,%ymm0
1879         vpaddq  %ymm8,%ymm12,%ymm12
1880         vpaddq  %ymm9,%ymm2,%ymm2
1881
1882
1883
1884
1885         vpsrlq  $26,%ymm3,%ymm14
1886         vpand   %ymm5,%ymm3,%ymm3
1887         vpaddq  %ymm14,%ymm4,%ymm4
1888
1889         vpsrlq  $26,%ymm0,%ymm11
1890         vpand   %ymm5,%ymm0,%ymm0
1891         vpaddq  %ymm11,%ymm12,%ymm1
1892
1893         vpsrlq  $26,%ymm4,%ymm15
1894         vpand   %ymm5,%ymm4,%ymm4
1895
1896         vpsrlq  $26,%ymm1,%ymm12
1897         vpand   %ymm5,%ymm1,%ymm1
1898         vpaddq  %ymm12,%ymm2,%ymm2
1899
1900         vpaddq  %ymm15,%ymm0,%ymm0
1901         vpsllq  $2,%ymm15,%ymm15
1902         vpaddq  %ymm15,%ymm0,%ymm0
1903
1904         vpsrlq  $26,%ymm2,%ymm13
1905         vpand   %ymm5,%ymm2,%ymm2
1906         vpaddq  %ymm13,%ymm3,%ymm3
1907
1908         vpsrlq  $26,%ymm0,%ymm11
1909         vpand   %ymm5,%ymm0,%ymm0
1910         vpaddq  %ymm11,%ymm1,%ymm1
1911
1912         vpsrlq  $26,%ymm3,%ymm14
1913         vpand   %ymm5,%ymm3,%ymm3
1914         vpaddq  %ymm14,%ymm4,%ymm4
1915
1916         vmovd   %xmm0,-112(%rdi)
1917         vmovd   %xmm1,-108(%rdi)
1918         vmovd   %xmm2,-104(%rdi)
1919         vmovd   %xmm3,-100(%rdi)
1920         vmovd   %xmm4,-96(%rdi)
1921         leaq    8(%r11),%rsp
1922 .cfi_def_cfa    %rsp,8
1923         vzeroupper
1924         .byte   0xf3,0xc3
1925 .cfi_endproc    
1926 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
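/*
 * Constant pool: .Lmask24 and .Lmask26 are the 2^24-1 and 2^26-1 lane
 * masks, .L129 holds 2^24 (the per-block padding bit in its base 2^26
 * position), and .Lpermd_avx2 feeds the vpermd shuffles above.  The
 * AVX-512 and base 2^44 tables that follow appear to be retained from the
 * generating script but are not referenced by the code emitted here.
 */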
1927 .align  64
1928 .Lconst:
1929 .Lmask24:
1930 .long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1931 .L129:
1932 .long   16777216,0,16777216,0,16777216,0,16777216,0
1933 .Lmask26:
1934 .long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1935 .Lpermd_avx2:
1936 .long   2,2,2,3,2,0,2,1
1937 .Lpermd_avx512:
1938 .long   0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1939
1940 .L2_44_inp_permd:
1941 .long   0,1,1,2,2,3,7,7
1942 .L2_44_inp_shift:
1943 .quad   0,12,24,64
1944 .L2_44_mask:
1945 .quad   0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1946 .L2_44_shift_rgt:
1947 .quad   44,44,42,64
1948 .L2_44_shift_lft:
1949 .quad   8,8,10,64
1950
1951 .align  64
1952 .Lx_mask44:
1953 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1954 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1955 .Lx_mask42:
1956 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1957 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1958 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1959 .align  16
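/*
 * Presumed prototype: void *xor128_encrypt_n_pad(void *out, const void
 * *inp, void *otp, size_t len).  %rdi = out, %rsi = inp, %rdx = key-stream
 * ("one-time pad") buffer that is read and then overwritten, %rcx = len.
 * XORs len bytes of input with the pad into out, writes the resulting
 * ciphertext back into the pad buffer, zero-fills it to the next 16-byte
 * boundary, and returns the advanced pad pointer in %rax.
 */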
1960 .globl  xor128_encrypt_n_pad
1961 .type   xor128_encrypt_n_pad,@function
1962 .align  16
1963 xor128_encrypt_n_pad:
1964         subq    %rdx,%rsi
1965         subq    %rdx,%rdi
1966         movq    %rcx,%r10
1967         shrq    $4,%rcx
1968         jz      .Ltail_enc
1969         nop
1970 .Loop_enc_xmm:
1971         movdqu  (%rsi,%rdx,1),%xmm0
1972         pxor    (%rdx),%xmm0
1973         movdqu  %xmm0,(%rdi,%rdx,1)
1974         movdqa  %xmm0,(%rdx)
1975         leaq    16(%rdx),%rdx
1976         decq    %rcx
1977         jnz     .Loop_enc_xmm
1978
1979         andq    $15,%r10
1980         jz      .Ldone_enc
1981
1982 .Ltail_enc:
1983         movq    $16,%rcx
1984         subq    %r10,%rcx
1985         xorl    %eax,%eax
1986 .Loop_enc_byte:
1987         movb    (%rsi,%rdx,1),%al
1988         xorb    (%rdx),%al
1989         movb    %al,(%rdi,%rdx,1)
1990         movb    %al,(%rdx)
1991         leaq    1(%rdx),%rdx
1992         decq    %r10
1993         jnz     .Loop_enc_byte
1994
1995         xorl    %eax,%eax
1996 .Loop_enc_pad:
1997         movb    %al,(%rdx)
1998         leaq    1(%rdx),%rdx
1999         decq    %rcx
2000         jnz     .Loop_enc_pad
2001
2002 .Ldone_enc:
2003         movq    %rdx,%rax
2004         .byte   0xf3,0xc3
2005 .size   xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2006
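/*
 * xor128_decrypt_n_pad: same register usage and padding behaviour as the
 * encrypt variant, except the incoming ciphertext (not the XOR result) is
 * what gets copied into the pad buffer, so a MAC computed over that
 * buffer still covers the ciphertext.
 */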
2007 .globl  xor128_decrypt_n_pad
2008 .type   xor128_decrypt_n_pad,@function
2009 .align  16
2010 xor128_decrypt_n_pad:
2011         subq    %rdx,%rsi
2012         subq    %rdx,%rdi
2013         movq    %rcx,%r10
2014         shrq    $4,%rcx
2015         jz      .Ltail_dec
2016         nop
2017 .Loop_dec_xmm:
2018         movdqu  (%rsi,%rdx,1),%xmm0
2019         movdqa  (%rdx),%xmm1
2020         pxor    %xmm0,%xmm1
2021         movdqu  %xmm1,(%rdi,%rdx,1)
2022         movdqa  %xmm0,(%rdx)
2023         leaq    16(%rdx),%rdx
2024         decq    %rcx
2025         jnz     .Loop_dec_xmm
2026
2027         pxor    %xmm1,%xmm1
2028         andq    $15,%r10
2029         jz      .Ldone_dec
2030
2031 .Ltail_dec:
2032         movq    $16,%rcx
2033         subq    %r10,%rcx
2034         xorl    %eax,%eax
2035         xorq    %r11,%r11
2036 .Loop_dec_byte:
2037         movb    (%rsi,%rdx,1),%r11b
2038         movb    (%rdx),%al
2039         xorb    %r11b,%al
2040         movb    %al,(%rdi,%rdx,1)
2041         movb    %r11b,(%rdx)
2042         leaq    1(%rdx),%rdx
2043         decq    %r10
2044         jnz     .Loop_dec_byte
2045
2046         xorl    %eax,%eax
2047 .Loop_dec_pad:
2048         movb    %al,(%rdx)
2049         leaq    1(%rdx),%rdx
2050         decq    %rcx
2051         jnz     .Loop_dec_pad
2052
2053 .Ldone_dec:
2054         movq    %rdx,%rax
2055         .byte   0xf3,0xc3
2056 .size   xor128_decrypt_n_pad,.-xor128_decrypt_n_pad