]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
Since contrib/libcxxrt's ancestry was never correct, subversion 1.8 and
[FreeBSD/FreeBSD.git] / secure / lib / libcrypto / amd64 / ecp_nistz256-x86_64.S
1         # $FreeBSD$
2 .text   
3
4
5
# Constant pool. It follows the .text directive, so these tables live in the
# code section; the routines in this file address them RIP-relative.
# Multi-word values are stored least-significant 64-bit limb first.
6 .align  64
# .Lpoly: the field modulus p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
# Limb 0 is all ones and limb 2 is zero, which is why several routines use
# the immediates $-1 and $0 instead of loading those two limbs.
7 .Lpoly:
8 .quad   0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
9
10
# .LRR: the second operand that ecp_nistz256_to_mont hands to the Montgomery
# multiply. NOTE(review): presumably 2^512 mod p (R^2 for R = 2^256) -- confirm.
11 .LRR:
12 .quad   0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
13
# .LOne / .LTwo / .LThree: eight 32-bit lanes holding 1, 2 and 3.
# .LOne seeds and steps the entry counter in the select_w5/select_w7 loops.
# .LTwo and .LThree are not referenced in the part of the file reviewed here.
14 .LOne:
15 .long   1,1,1,1,1,1,1,1
16 .LTwo:
17 .long   2,2,2,2,2,2,2,2
18 .LThree:
19 .long   3,3,3,3,3,3,3,3
# .LONE_mont: numerically 2^256 - p, which is 2^256 mod p.
# NOTE(review): going by the name this is the value 1 in Montgomery form;
# it is not referenced in the part of the file reviewed here.
20 .LONE_mont:
21 .quad   0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
22
# ecp_nistz256_mul_by_2: double a 256-bit field element modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input a
#   Out:  (%rdi) = 2*a mod p, fully reduced when a < p
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
# The modulus is subtracted from the full 257-bit sum (the borrow is carried
# into the top word), so a sum in [p, 2^256) is reduced as well. Testing only
# the carry out of the doubling would leave such sums unreduced.
23 .globl  ecp_nistz256_mul_by_2
24 .type   ecp_nistz256_mul_by_2,@function
25 .align  64
26 ecp_nistz256_mul_by_2:
27         pushq   %r12
28         pushq   %r13
29
        # Load a and double it limb by limb; the loads are interleaved with
        # the add/adc chain (movq and leaq leave the carry flag untouched).
30         movq    0(%rsi),%r8
31         movq    8(%rsi),%r9
32         addq    %r8,%r8
33         movq    16(%rsi),%r10
34         adcq    %r9,%r9
35         movq    24(%rsi),%r11
36         leaq    .Lpoly(%rip),%rsi
37         movq    %r8,%rax
38         adcq    %r10,%r10
39         adcq    %r11,%r11
40         movq    %r9,%rdx
        # Capture the carry out of bit 255: %r13 = 0 (no carry) or -1 (carry).
41         sbbq    %r13,%r13
42
        # Subtract p; %rax,%rdx,%rcx,%r12 keep the unreduced sum.
43         subq    0(%rsi),%r8
44         movq    %r10,%rcx
45         sbbq    8(%rsi),%r9
46         sbbq    16(%rsi),%r10
47         movq    %r11,%r12
48         sbbq    24(%rsi),%r11
        # Fold the borrow into the carry word. The carry flag ends up set only
        # when the doubling did not carry AND the subtraction borrowed, which
        # is exactly the case 2*a < p.
49         sbbq    $0,%r13
50
        # Carry set: 2*a was already below p, so restore the unreduced sum.
51         cmovcq  %rax,%r8
52         cmovcq  %rdx,%r9
53         movq    %r8,0(%rdi)
54         cmovcq  %rcx,%r10
55         movq    %r9,8(%rdi)
56         cmovcq  %r12,%r11
57         movq    %r10,16(%rdi)
58         movq    %r11,24(%rdi)
59
60         popq    %r13
61         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
62         .byte   0xf3,0xc3
63 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
64
65
66
# ecp_nistz256_div_by_2: halve a 256-bit field element modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input a
#   Out:  (%rdi) = a >> 1 when a is even, (a + p) >> 1 when a is odd
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
67 .globl  ecp_nistz256_div_by_2
68 .type   ecp_nistz256_div_by_2,@function
69 .align  32
70 ecp_nistz256_div_by_2:
71         pushq   %r12
72         pushq   %r13
73
        # Load a into %r8-%r11; %rax,%rdx,%rcx,%r12 receive unmodified copies.
74         movq    0(%rsi),%r8
75         movq    8(%rsi),%r9
76         movq    16(%rsi),%r10
77         movq    %r8,%rax
78         movq    24(%rsi),%r11
79         leaq    .Lpoly(%rip),%rsi
80
        # Compute a + p as a 5-limb value; %r13 collects the carry out.
81         movq    %r9,%rdx
82         xorq    %r13,%r13
83         addq    0(%rsi),%r8
84         movq    %r10,%rcx
85         adcq    8(%rsi),%r9
86         adcq    16(%rsi),%r10
87         movq    %r11,%r12
88         adcq    24(%rsi),%r11
89         adcq    $0,%r13
        # %rsi becomes the zero used to clear the top limb in the even case.
90         xorq    %rsi,%rsi
        # Test the low bit of the original a (ZF set means a is even).
91         testq   $1,%rax
92
        # Even a: discard the sum and go back to a itself with a zero top limb.
93         cmovzq  %rax,%r8
94         cmovzq  %rdx,%r9
95         cmovzq  %rcx,%r10
96         cmovzq  %r12,%r11
97         cmovzq  %rsi,%r13
98
        # Shift the 5-limb value right by one bit: each limb is shifted and
        # receives bit 0 of the next higher limb in its bit 63.
99         movq    %r9,%rax
100         shrq    $1,%r8
101         shlq    $63,%rax
102         movq    %r10,%rdx
103         shrq    $1,%r9
104         orq     %rax,%r8
105         shlq    $63,%rdx
106         movq    %r11,%rcx
107         shrq    $1,%r10
108         orq     %rdx,%r9
109         shlq    $63,%rcx
110         shrq    $1,%r11
111         shlq    $63,%r13
112         orq     %rcx,%r10
113         orq     %r13,%r11
114
115         movq    %r8,0(%rdi)
116         movq    %r9,8(%rdi)
117         movq    %r10,16(%rdi)
118         movq    %r11,24(%rdi)
119
120         popq    %r13
121         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
122         .byte   0xf3,0xc3
123 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
124
125
126
# ecp_nistz256_mul_by_3: triple a 256-bit field element modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input a
#   Out:  (%rdi) = 3*a mod p, fully reduced when a < p
#   Uses: %rax, %rcx, %rdx, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning. %rsi is preserved.
# Computed as (2*a mod p) + a mod p. After each addition the modulus is
# subtracted from the full 257-bit sum (the borrow is carried into the top
# word), so a sum in [p, 2^256) is reduced too. Testing only the carry out of
# the addition would leave such sums unreduced.
127 .globl  ecp_nistz256_mul_by_3
128 .type   ecp_nistz256_mul_by_3,@function
129 .align  32
130 ecp_nistz256_mul_by_3:
131         pushq   %r12
132         pushq   %r13
133
        # Stage 1: %r8-%r11 = 2*a, with the carry out of bit 255 in %r13.
134         movq    0(%rsi),%r8
135         xorq    %r13,%r13
136         movq    8(%rsi),%r9
137         addq    %r8,%r8
138         movq    16(%rsi),%r10
139         adcq    %r9,%r9
140         movq    24(%rsi),%r11
141         movq    %r8,%rax
142         adcq    %r10,%r10
143         adcq    %r11,%r11
144         movq    %r9,%rdx
145         adcq    $0,%r13
146
        # Subtract p. Limbs 0 and 2 of p are -1 and 0, so immediates are used
        # for them; limbs 1 and 3 come from the .Lpoly table.
147         subq    $-1,%r8
148         movq    %r10,%rcx
149         sbbq    .Lpoly+8(%rip),%r9
150         sbbq    $0,%r10
151         movq    %r11,%r12
152         sbbq    .Lpoly+24(%rip),%r11
        # Extend the borrow into the carry word; the carry flag is set only
        # when the 257-bit sum was below p.
153         sbbq    $0,%r13
154
        # Carry set: the sum was already reduced, so restore it.
155         cmovcq  %rax,%r8
156         cmovcq  %rdx,%r9
157         cmovcq  %rcx,%r10
158         cmovcq  %r12,%r11
159
        # Stage 2: add a once more (%rsi still addresses the input).
160         xorq    %r13,%r13
161         addq    0(%rsi),%r8
162         adcq    8(%rsi),%r9
163         movq    %r8,%rax
164         adcq    16(%rsi),%r10
165         adcq    24(%rsi),%r11
166         movq    %r9,%rdx
167         adcq    $0,%r13
168
        # Same full-width conditional subtraction of p as in stage 1.
169         subq    $-1,%r8
170         movq    %r10,%rcx
171         sbbq    .Lpoly+8(%rip),%r9
172         sbbq    $0,%r10
173         movq    %r11,%r12
174         sbbq    .Lpoly+24(%rip),%r11
175         sbbq    $0,%r13
176
177         cmovcq  %rax,%r8
178         cmovcq  %rdx,%r9
179         movq    %r8,0(%rdi)
180         cmovcq  %rcx,%r10
181         movq    %r9,8(%rdi)
182         cmovcq  %r12,%r11
183         movq    %r10,16(%rdi)
184         movq    %r11,24(%rdi)
185
186         popq    %r13
187         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
188         .byte   0xf3,0xc3
189 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
190
191
192
# ecp_nistz256_add: add two 256-bit field elements modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of a,
#         %rdx = address of b
#   Out:  (%rdi) = a + b mod p, fully reduced when a < p and b < p
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
# The modulus is subtracted from the full 257-bit sum (the borrow is carried
# into the top word), so a sum in [p, 2^256) is reduced as well. Testing only
# the carry out of the addition would leave such sums unreduced.
193 .globl  ecp_nistz256_add
194 .type   ecp_nistz256_add,@function
195 .align  32
196 ecp_nistz256_add:
197         pushq   %r12
198         pushq   %r13
199
        # Load a; %r13 is cleared to receive the carry out of bit 255.
200         movq    0(%rsi),%r8
201         xorq    %r13,%r13
202         movq    8(%rsi),%r9
203         movq    16(%rsi),%r10
204         movq    24(%rsi),%r11
205         leaq    .Lpoly(%rip),%rsi
206
        # Sum = a + b. The pointer in %rdx is overwritten with a copy of limb 1
        # only after the last load through it.
207         addq    0(%rdx),%r8
208         adcq    8(%rdx),%r9
209         movq    %r8,%rax
210         adcq    16(%rdx),%r10
211         adcq    24(%rdx),%r11
212         movq    %r9,%rdx
213         adcq    $0,%r13
214
        # Subtract p; %rax,%rdx,%rcx,%r12 keep the unreduced sum.
215         subq    0(%rsi),%r8
216         movq    %r10,%rcx
217         sbbq    8(%rsi),%r9
218         sbbq    16(%rsi),%r10
219         movq    %r11,%r12
220         sbbq    24(%rsi),%r11
        # Extend the borrow into the carry word; the carry flag is set only
        # when the 257-bit sum was below p.
221         sbbq    $0,%r13
222
        # Carry set: the sum was already reduced, so restore it.
223         cmovcq  %rax,%r8
224         cmovcq  %rdx,%r9
225         movq    %r8,0(%rdi)
226         cmovcq  %rcx,%r10
227         movq    %r9,8(%rdi)
228         cmovcq  %r12,%r11
229         movq    %r10,16(%rdi)
230         movq    %r11,24(%rdi)
231
232         popq    %r13
233         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
234         .byte   0xf3,0xc3
235 .size   ecp_nistz256_add,.-ecp_nistz256_add
236
237
238
# ecp_nistz256_sub: subtract two 256-bit field elements modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of a,
#         %rdx = address of b
#   Out:  (%rdi) = a - b, with p added back when the subtraction borrowed
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
239 .globl  ecp_nistz256_sub
240 .type   ecp_nistz256_sub,@function
241 .align  32
242 ecp_nistz256_sub:
243         pushq   %r12
244         pushq   %r13
245
        # Load a; %r13 is cleared to record the borrow.
246         movq    0(%rsi),%r8
247         xorq    %r13,%r13
248         movq    8(%rsi),%r9
249         movq    16(%rsi),%r10
250         movq    24(%rsi),%r11
251         leaq    .Lpoly(%rip),%rsi
252
        # Difference = a - b; %r13 becomes -1 on borrow and stays 0 otherwise.
253         subq    0(%rdx),%r8
254         sbbq    8(%rdx),%r9
255         movq    %r8,%rax
256         sbbq    16(%rdx),%r10
257         sbbq    24(%rdx),%r11
258         movq    %r9,%rdx
259         sbbq    $0,%r13
260
        # Add p; %rax,%rdx,%rcx,%r12 keep the raw difference.
261         addq    0(%rsi),%r8
262         movq    %r10,%rcx
263         adcq    8(%rsi),%r9
264         adcq    16(%rsi),%r10
265         movq    %r11,%r12
266         adcq    24(%rsi),%r11
267         testq   %r13,%r13
268
        # No borrow (%r13 == 0): the raw difference is the answer, restore it.
269         cmovzq  %rax,%r8
270         cmovzq  %rdx,%r9
271         movq    %r8,0(%rdi)
272         cmovzq  %rcx,%r10
273         movq    %r9,8(%rdi)
274         cmovzq  %r12,%r11
275         movq    %r10,16(%rdi)
276         movq    %r11,24(%rdi)
277
278         popq    %r13
279         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
280         .byte   0xf3,0xc3
281 .size   ecp_nistz256_sub,.-ecp_nistz256_sub
282
283
284
# ecp_nistz256_neg: negate a 256-bit field element modulo p.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input a
#   Out:  (%rdi) = 0 - a, with p added back when the subtraction borrowed
#         (so an input of zero yields zero)
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
285 .globl  ecp_nistz256_neg
286 .type   ecp_nistz256_neg,@function
287 .align  32
288 ecp_nistz256_neg:
289         pushq   %r12
290         pushq   %r13
291
        # Start from a zero minuend and a zero borrow word.
292         xorq    %r8,%r8
293         xorq    %r9,%r9
294         xorq    %r10,%r10
295         xorq    %r11,%r11
296         xorq    %r13,%r13
297
        # Difference = 0 - a; %r13 becomes -1 on borrow and stays 0 otherwise.
298         subq    0(%rsi),%r8
299         sbbq    8(%rsi),%r9
300         sbbq    16(%rsi),%r10
301         movq    %r8,%rax
302         sbbq    24(%rsi),%r11
303         leaq    .Lpoly(%rip),%rsi
304         movq    %r9,%rdx
305         sbbq    $0,%r13
306
        # Add p; %rax,%rdx,%rcx,%r12 keep the raw difference.
307         addq    0(%rsi),%r8
308         movq    %r10,%rcx
309         adcq    8(%rsi),%r9
310         adcq    16(%rsi),%r10
311         movq    %r11,%r12
312         adcq    24(%rsi),%r11
313         testq   %r13,%r13
314
        # No borrow (%r13 == 0): restore the raw difference.
315         cmovzq  %rax,%r8
316         cmovzq  %rdx,%r9
317         movq    %r8,0(%rdi)
318         cmovzq  %rcx,%r10
319         movq    %r9,8(%rdi)
320         cmovzq  %r12,%r11
321         movq    %r10,16(%rdi)
322         movq    %r11,24(%rdi)
323
324         popq    %r13
325         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
326         .byte   0xf3,0xc3
327 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
328
329
330
331
# ecp_nistz256_to_mont: Montgomery-multiply the input by the .LRR constant.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input
# Points %rdx (the second-operand pointer of ecp_nistz256_mul_mont) at .LRR and
# tail-jumps to .Lmul_mont, the label at the top of ecp_nistz256_mul_mont, so
# that routine performs the multiplication and returns to the caller.
332 .globl  ecp_nistz256_to_mont
333 .type   ecp_nistz256_to_mont,@function
334 .align  32
335 ecp_nistz256_to_mont:
336         leaq    .LRR(%rip),%rdx
337         jmp     .Lmul_mont
338 .size   ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
339
340
341
342
343
344
345
# ecp_nistz256_mul_mont: Montgomery multiplication of two 4-limb values.
#   In:   %rdi = address of the 4-limb result, %rsi = address of a,
#         %rdx = address of b
#   Out:  (%rdi) = result written by __ecp_nistz256_mul_montq
# Saves %rbp, %rbx and %r12-%r15, sets up the register arguments of the
# internal helper and calls it. ecp_nistz256_to_mont enters at .Lmul_mont.
346 .globl  ecp_nistz256_mul_mont
347 .type   ecp_nistz256_mul_mont,@function
348 .align  32
349 ecp_nistz256_mul_mont:
350 .Lmul_mont:
351         pushq   %rbp
352         pushq   %rbx
353         pushq   %r12
354         pushq   %r13
355         pushq   %r14
356         pushq   %r15
        # Helper arguments: %rbx = address of b, %rax = b[0],
        # %r9-%r12 = a[0..3]; %rsi and %rdi are passed through unchanged.
357         movq    %rdx,%rbx
358         movq    0(%rdx),%rax
359         movq    0(%rsi),%r9
360         movq    8(%rsi),%r10
361         movq    16(%rsi),%r11
362         movq    24(%rsi),%r12
363
364         call    __ecp_nistz256_mul_montq
365 .Lmul_mont_done:
366         popq    %r15
367         popq    %r14
368         popq    %r13
369         popq    %r12
370         popq    %rbx
371         popq    %rbp
        # The bytes 0xf3,0xc3 encode "rep ret".
372         .byte   0xf3,0xc3
373 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
374
# __ecp_nistz256_mul_montq: internal Montgomery multiplication helper with a
# register-based calling convention.
#   In:   %rax = b[0], %rbx = address of b, %rsi = address of a,
#         %r9,%r10,%r11,%r12 = a[0..3], %rdi = address of the 4-limb result
#   Out:  result limbs 0..3 in %r12,%r13,%r8,%r9 and stored at (%rdi);
#         %r14 and %r15 hold limbs 1 and 3 of p on return
#   Clobbers %rax, %rbx, %rcx, %rdx, %rbp, %r8-%r15 and flags.
# Structure: four rounds, one per limb of b. Each round adds a * b[i] to the
# running accumulator and is followed by a reduction step that retires the
# lowest accumulator limb.
# Reduction step: limb 0 of p is all ones, so adding acc0 * p cancels the
# lowest limb acc0. What remains to be added is acc0 * 2^96 (the shl/shr by 32
# pair, landing in limbs 1 and 2) plus acc0 * p[3] * 2^192 (the mulq by %r15,
# landing in limbs 3 and 4).
375 .type   __ecp_nistz256_mul_montq,@function
376 .align  32
377 __ecp_nistz256_mul_montq:
378
379
        # Round 0: accumulator %r8..%r12 = a * b[0]; %rbp keeps b[0] across
        # the mulq instructions, which overwrite %rax and %rdx.
380         movq    %rax,%rbp
381         mulq    %r9
382         movq    .Lpoly+8(%rip),%r14
383         movq    %rax,%r8
384         movq    %rbp,%rax
385         movq    %rdx,%r9
386
387         mulq    %r10
388         movq    .Lpoly+24(%rip),%r15
389         addq    %rax,%r9
390         movq    %rbp,%rax
391         adcq    $0,%rdx
392         movq    %rdx,%r10
393
394         mulq    %r11
395         addq    %rax,%r10
396         movq    %rbp,%rax
397         adcq    $0,%rdx
398         movq    %rdx,%r11
399
400         mulq    %r12
401         addq    %rax,%r11
402         movq    %r8,%rax
403         adcq    $0,%rdx
404         xorq    %r13,%r13
405         movq    %rdx,%r12
406
407
408
409
410
411
412
413
414
415
        # Reduction step 1: retire %r8; accumulator becomes %r9..%r13.
        # The load of b[1] is interleaved with the carry chain.
416         movq    %r8,%rbp
417         shlq    $32,%r8
418         mulq    %r15
419         shrq    $32,%rbp
420         addq    %r8,%r9
421         adcq    %rbp,%r10
422         adcq    %rax,%r11
423         movq    8(%rbx),%rax
424         adcq    %rdx,%r12
425         adcq    $0,%r13
426         xorq    %r8,%r8
427
428
429
        # Round 1: accumulator += a * b[1]; %r8 is the new top limb.
430         movq    %rax,%rbp
431         mulq    0(%rsi)
432         addq    %rax,%r9
433         movq    %rbp,%rax
434         adcq    $0,%rdx
435         movq    %rdx,%rcx
436
437         mulq    8(%rsi)
438         addq    %rcx,%r10
439         adcq    $0,%rdx
440         addq    %rax,%r10
441         movq    %rbp,%rax
442         adcq    $0,%rdx
443         movq    %rdx,%rcx
444
445         mulq    16(%rsi)
446         addq    %rcx,%r11
447         adcq    $0,%rdx
448         addq    %rax,%r11
449         movq    %rbp,%rax
450         adcq    $0,%rdx
451         movq    %rdx,%rcx
452
453         mulq    24(%rsi)
454         addq    %rcx,%r12
455         adcq    $0,%rdx
456         addq    %rax,%r12
457         movq    %r9,%rax
458         adcq    %rdx,%r13
459         adcq    $0,%r8
460
461
462
        # Reduction step 2: retire %r9; accumulator becomes %r10..%r13,%r8.
463         movq    %r9,%rbp
464         shlq    $32,%r9
465         mulq    %r15
466         shrq    $32,%rbp
467         addq    %r9,%r10
468         adcq    %rbp,%r11
469         adcq    %rax,%r12
470         movq    16(%rbx),%rax
471         adcq    %rdx,%r13
472         adcq    $0,%r8
473         xorq    %r9,%r9
474
475
476
        # Round 2: accumulator += a * b[2]; %r9 is the new top limb.
477         movq    %rax,%rbp
478         mulq    0(%rsi)
479         addq    %rax,%r10
480         movq    %rbp,%rax
481         adcq    $0,%rdx
482         movq    %rdx,%rcx
483
484         mulq    8(%rsi)
485         addq    %rcx,%r11
486         adcq    $0,%rdx
487         addq    %rax,%r11
488         movq    %rbp,%rax
489         adcq    $0,%rdx
490         movq    %rdx,%rcx
491
492         mulq    16(%rsi)
493         addq    %rcx,%r12
494         adcq    $0,%rdx
495         addq    %rax,%r12
496         movq    %rbp,%rax
497         adcq    $0,%rdx
498         movq    %rdx,%rcx
499
500         mulq    24(%rsi)
501         addq    %rcx,%r13
502         adcq    $0,%rdx
503         addq    %rax,%r13
504         movq    %r10,%rax
505         adcq    %rdx,%r8
506         adcq    $0,%r9
507
508
509
        # Reduction step 3: retire %r10; accumulator becomes %r11..%r13,%r8,%r9.
510         movq    %r10,%rbp
511         shlq    $32,%r10
512         mulq    %r15
513         shrq    $32,%rbp
514         addq    %r10,%r11
515         adcq    %rbp,%r12
516         adcq    %rax,%r13
517         movq    24(%rbx),%rax
518         adcq    %rdx,%r8
519         adcq    $0,%r9
520         xorq    %r10,%r10
521
522
523
        # Round 3: accumulator += a * b[3]; %r10 is the new top limb.
524         movq    %rax,%rbp
525         mulq    0(%rsi)
526         addq    %rax,%r11
527         movq    %rbp,%rax
528         adcq    $0,%rdx
529         movq    %rdx,%rcx
530
531         mulq    8(%rsi)
532         addq    %rcx,%r12
533         adcq    $0,%rdx
534         addq    %rax,%r12
535         movq    %rbp,%rax
536         adcq    $0,%rdx
537         movq    %rdx,%rcx
538
539         mulq    16(%rsi)
540         addq    %rcx,%r13
541         adcq    $0,%rdx
542         addq    %rax,%r13
543         movq    %rbp,%rax
544         adcq    $0,%rdx
545         movq    %rdx,%rcx
546
547         mulq    24(%rsi)
548         addq    %rcx,%r8
549         adcq    $0,%rdx
550         addq    %rax,%r8
551         movq    %r11,%rax
552         adcq    %rdx,%r9
553         adcq    $0,%r10
554
555
556
        # Reduction step 4: retire %r11. The 5-limb value is now
        # %r12,%r13,%r8,%r9 with the carry in %r10; %rcx and %rbp start
        # collecting copies of the low limbs for the final selection.
557         movq    %r11,%rbp
558         shlq    $32,%r11
559         mulq    %r15
560         shrq    $32,%rbp
561         addq    %r11,%r12
562         adcq    %rbp,%r13
563         movq    %r12,%rcx
564         adcq    %rax,%r8
565         adcq    %rdx,%r9
566         movq    %r13,%rbp
567         adcq    $0,%r10
568
569
570
        # Final step: subtract p from the 5-limb value (immediates stand in
        # for limbs 0 and 2 of p); the borrow is carried into %r10.
571         subq    $-1,%r12
572         movq    %r8,%rbx
573         sbbq    %r14,%r13
574         sbbq    $0,%r8
575         movq    %r9,%rdx
576         sbbq    %r15,%r9
577         sbbq    $0,%r10
578
        # Carry set means the value was below p: restore the saved copies.
579         cmovcq  %rcx,%r12
580         cmovcq  %rbp,%r13
581         movq    %r12,0(%rdi)
582         cmovcq  %rbx,%r8
583         movq    %r13,8(%rdi)
584         cmovcq  %rdx,%r9
585         movq    %r8,16(%rdi)
586         movq    %r9,24(%rdi)
587
        # The bytes 0xf3,0xc3 encode "rep ret".
588         .byte   0xf3,0xc3
589 .size   __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
590
591
592
593
594
595
596
597
# ecp_nistz256_sqr_mont: Montgomery squaring of a 4-limb value.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input a
#   Out:  (%rdi) = result written by __ecp_nistz256_sqr_montq
# Saves %rbp, %rbx and %r12-%r15, loads the register arguments of the internal
# helper and calls it.
598 .globl  ecp_nistz256_sqr_mont
599 .type   ecp_nistz256_sqr_mont,@function
600 .align  32
601 ecp_nistz256_sqr_mont:
602         pushq   %rbp
603         pushq   %rbx
604         pushq   %r12
605         pushq   %r13
606         pushq   %r14
607         pushq   %r15
        # Helper arguments: %rax = a[0], %r14 = a[1], %r15 = a[2], %r8 = a[3].
608         movq    0(%rsi),%rax
609         movq    8(%rsi),%r14
610         movq    16(%rsi),%r15
611         movq    24(%rsi),%r8
612
613         call    __ecp_nistz256_sqr_montq
614 .Lsqr_mont_done:
615         popq    %r15
616         popq    %r14
617         popq    %r13
618         popq    %r12
619         popq    %rbx
620         popq    %rbp
        # The bytes 0xf3,0xc3 encode "rep ret".
621         .byte   0xf3,0xc3
622 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
623
# __ecp_nistz256_sqr_montq: internal Montgomery squaring helper with a
# register-based calling convention.
#   In:   %rax = a[0], %r14 = a[1], %r15 = a[2], %r8 = a[3],
#         %rsi = address of a, %rdi = address of the 4-limb result
#   Out:  result limbs 0..3 in %r12,%r13,%r14,%r15 and stored at (%rdi);
#         %rsi and %rbp hold limbs 1 and 3 of p on return
#   Clobbers %rax, %rcx, %rdx, %rbp, %rsi, %r8-%r15 and flags; %rbx is not touched.
# Structure: the six cross products a[i]*a[j] (i < j) are summed and doubled,
# the four squares a[i]^2 are added, the low four limbs of the 512-bit product
# are retired by four reduction steps, and the upper half is added before one
# conditional subtraction of p.
624 .type   __ecp_nistz256_sqr_montq,@function
625 .align  32
626 __ecp_nistz256_sqr_montq:
        # Cross products with a[0] (kept in %r13): a0*a1, a0*a2, a0*a3.
627         movq    %rax,%r13
628         mulq    %r14
629         movq    %rax,%r9
630         movq    %r15,%rax
631         movq    %rdx,%r10
632
633         mulq    %r13
634         addq    %rax,%r10
635         movq    %r8,%rax
636         adcq    $0,%rdx
637         movq    %rdx,%r11
638
639         mulq    %r13
640         addq    %rax,%r11
641         movq    %r15,%rax
642         adcq    $0,%rdx
643         movq    %rdx,%r12
644
645
        # Cross products with a[1]: a1*a2 and a1*a3.
646         mulq    %r14
647         addq    %rax,%r11
648         movq    %r8,%rax
649         adcq    $0,%rdx
650         movq    %rdx,%rbp
651
652         mulq    %r14
653         addq    %rax,%r12
654         movq    %r8,%rax
655         adcq    $0,%rdx
656         addq    %rbp,%r12
657         movq    %rdx,%r13
658         adcq    $0,%r13
659
660
        # Last cross product a2*a3; %r15 is cleared to take the doubling carry
        # and %rax is reloaded with a[0] for the squares below.
661         mulq    %r15
662         xorq    %r15,%r15
663         addq    %rax,%r13
664         movq    0(%rsi),%rax
665         movq    %rdx,%r14
666         adcq    $0,%r14
667
        # Double the sum of cross products (%r9..%r14, carry into %r15).
668         addq    %r9,%r9
669         adcq    %r10,%r10
670         adcq    %r11,%r11
671         adcq    %r12,%r12
672         adcq    %r13,%r13
673         adcq    %r14,%r14
674         adcq    $0,%r15
675
        # Add the squares a[i]^2; the full product is then %r8..%r15.
676         mulq    %rax
677         movq    %rax,%r8
678         movq    8(%rsi),%rax
679         movq    %rdx,%rcx
680
681         mulq    %rax
682         addq    %rcx,%r9
683         adcq    %rax,%r10
684         movq    16(%rsi),%rax
685         adcq    $0,%rdx
686         movq    %rdx,%rcx
687
688         mulq    %rax
689         addq    %rcx,%r11
690         adcq    %rax,%r12
691         movq    24(%rsi),%rax
692         adcq    $0,%rdx
693         movq    %rdx,%rcx
694
695         mulq    %rax
696         addq    %rcx,%r13
697         adcq    %rax,%r14
698         movq    %r8,%rax
699         adcq    %rdx,%r15
700
        # The input pointer is no longer needed: %rsi and %rbp now hold
        # limbs 1 and 3 of p.
701         movq    .Lpoly+8(%rip),%rsi
702         movq    .Lpoly+24(%rip),%rbp
703
704
705
706
        # Reduction step 1: retire %r8. Limb 0 of p is all ones, so adding
        # %r8 * p cancels %r8 and leaves %r8 * 2^96 (the shl/shr by 32 pair)
        # plus %r8 * p[3] * 2^192 (the mulq by %rbp) to be added.
707         movq    %r8,%rcx
708         shlq    $32,%r8
709         mulq    %rbp
710         shrq    $32,%rcx
711         addq    %r8,%r9
712         adcq    %rcx,%r10
713         adcq    %rax,%r11
714         movq    %r9,%rax
715         adcq    $0,%rdx
716
717
718
        # Reduction step 2: retire %r9; %r8 takes the high word of step 1.
719         movq    %r9,%rcx
720         shlq    $32,%r9
721         movq    %rdx,%r8
722         mulq    %rbp
723         shrq    $32,%rcx
724         addq    %r9,%r10
725         adcq    %rcx,%r11
726         adcq    %rax,%r8
727         movq    %r10,%rax
728         adcq    $0,%rdx
729
730
731
        # Reduction step 3: retire %r10.
732         movq    %r10,%rcx
733         shlq    $32,%r10
734         movq    %rdx,%r9
735         mulq    %rbp
736         shrq    $32,%rcx
737         addq    %r10,%r11
738         adcq    %rcx,%r8
739         adcq    %rax,%r9
740         movq    %r11,%rax
741         adcq    $0,%rdx
742
743
744
        # Reduction step 4: retire %r11; the reduced low part is now
        # %r8,%r9,%r10,%rdx and %r11 is cleared to take the next carry.
745         movq    %r11,%rcx
746         shlq    $32,%r11
747         movq    %rdx,%r10
748         mulq    %rbp
749         shrq    $32,%rcx
750         addq    %r11,%r8
751         adcq    %rcx,%r9
752         adcq    %rax,%r10
753         adcq    $0,%rdx
754         xorq    %r11,%r11
755
756
757
        # Add the reduced low part to the upper half %r12..%r15 (carry into
        # %r11); %r8,%r9 are reused for copies of the unsubtracted limbs.
758         addq    %r8,%r12
759         adcq    %r9,%r13
760         movq    %r12,%r8
761         adcq    %r10,%r14
762         adcq    %rdx,%r15
763         movq    %r13,%r9
764         adcq    $0,%r11
765
        # Subtract p from the 5-limb value; the borrow is carried into %r11.
766         subq    $-1,%r12
767         movq    %r14,%r10
768         sbbq    %rsi,%r13
769         sbbq    $0,%r14
770         movq    %r15,%rcx
771         sbbq    %rbp,%r15
772         sbbq    $0,%r11
773
        # Carry set means the value was below p: restore the saved copies.
774         cmovcq  %r8,%r12
775         cmovcq  %r9,%r13
776         movq    %r12,0(%rdi)
777         cmovcq  %r10,%r14
778         movq    %r13,8(%rdi)
779         cmovcq  %rcx,%r15
780         movq    %r14,16(%rdi)
781         movq    %r15,24(%rdi)
782
        # The bytes 0xf3,0xc3 encode "rep ret".
783         .byte   0xf3,0xc3
784 .size   __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
785
786
787
788
789
790
# ecp_nistz256_from_mont: run the four Montgomery reduction steps on a 4-limb
# value without any multiplication by a second operand, then subtract p once
# when the result is not below it.
#   In:   %rdi = address of the 4-limb result, %rsi = address of the 4-limb input
#   Out:  (%rdi) = reduced value
#   Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags as scratch; %r12 and %r13
#         are pushed on entry and popped before returning.
791 .globl  ecp_nistz256_from_mont
792 .type   ecp_nistz256_from_mont,@function
793 .align  32
794 ecp_nistz256_from_mont:
795         pushq   %r12
796         pushq   %r13
797
        # Load the input (limb 0 into both %rax and %r8) and limbs 1 and 3 of
        # p into %r12 and %r13.
798         movq    0(%rsi),%rax
799         movq    .Lpoly+24(%rip),%r13
800         movq    8(%rsi),%r9
801         movq    16(%rsi),%r10
802         movq    24(%rsi),%r11
803         movq    %rax,%r8
804         movq    .Lpoly+8(%rip),%r12
805
806
807
        # Reduction step 1: retire limb 0. Limb 0 of p is all ones, so adding
        # limb0 * p cancels it and leaves limb0 * 2^96 (the shl/shr by 32 pair)
        # plus limb0 * p[3] * 2^192 (the mulq by %r13) to be added.
808         movq    %rax,%rcx
809         shlq    $32,%r8
810         mulq    %r13
811         shrq    $32,%rcx
812         addq    %r8,%r9
813         adcq    %rcx,%r10
814         adcq    %rax,%r11
815         movq    %r9,%rax
816         adcq    $0,%rdx
817
818
819
        # Reduction step 2: retire %r9; %r8 takes the high word of step 1.
820         movq    %r9,%rcx
821         shlq    $32,%r9
822         movq    %rdx,%r8
823         mulq    %r13
824         shrq    $32,%rcx
825         addq    %r9,%r10
826         adcq    %rcx,%r11
827         adcq    %rax,%r8
828         movq    %r10,%rax
829         adcq    $0,%rdx
830
831
832
        # Reduction step 3: retire %r10.
833         movq    %r10,%rcx
834         shlq    $32,%r10
835         movq    %rdx,%r9
836         mulq    %r13
837         shrq    $32,%rcx
838         addq    %r10,%r11
839         adcq    %rcx,%r8
840         adcq    %rax,%r9
841         movq    %r11,%rax
842         adcq    $0,%rdx
843
844
845
        # Reduction step 4: retire %r11. The value is now %r8,%r9,%r10,%rdx;
        # %rcx and %rsi receive copies of the two low limbs.
846         movq    %r11,%rcx
847         shlq    $32,%r11
848         movq    %rdx,%r10
849         mulq    %r13
850         shrq    $32,%rcx
851         addq    %r11,%r8
852         adcq    %rcx,%r9
853         movq    %r8,%rcx
854         adcq    %rax,%r10
855         movq    %r9,%rsi
856         adcq    $0,%rdx
857
858
859
        # Subtract p; %rax and %r11 receive copies of limbs 2 and 3.
        # The closing sbbq turns the borrow into a mask in %r13 and leaves
        # ZF set exactly when there was no borrow.
860         subq    $-1,%r8
861         movq    %r10,%rax
862         sbbq    %r12,%r9
863         sbbq    $0,%r10
864         movq    %rdx,%r11
865         sbbq    %r13,%rdx
866         sbbq    %r13,%r13
867
        # Borrow (ZF clear): restore the unsubtracted limbs 0..2; %r11 already
        # holds the unsubtracted limb 3. No borrow (ZF set): keep the
        # subtracted limbs 0..2 and take the subtracted limb 3 from %rdx.
868         cmovnzq %rcx,%r8
869         cmovnzq %rsi,%r9
870         movq    %r8,0(%rdi)
871         cmovnzq %rax,%r10
872         movq    %r9,8(%rdi)
873         cmovzq  %rdx,%r11
874         movq    %r10,16(%rdi)
875         movq    %r11,24(%rdi)
876
877         popq    %r13
878         popq    %r12
        # The bytes 0xf3,0xc3 encode "rep ret".
879         .byte   0xf3,0xc3
880 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
881
882
# ecp_nistz256_select_w5: copy one 96-byte entry out of a 16-entry table.
#   In:   %rdi = address of the 96-byte output, %rsi = address of the table
#         (16 entries of 96 bytes), %edx = entry number
#   Out:  (%rdi) = entry number %edx, counting from 1; all zero bytes when
#         %edx matches none of 1..16
# Every entry is loaded on every call and the wanted one is picked with a
# compare mask, so the addresses read do not depend on %edx.
# The table is read with movdqa, which requires 16-byte alignment; the output
# is written with movdqu and carries no alignment requirement.
# Uses %rax, %rsi and %xmm0-%xmm15 as scratch.
883 .globl  ecp_nistz256_select_w5
884 .type   ecp_nistz256_select_w5,@function
885 .align  32
886 ecp_nistz256_select_w5:
        # %xmm0 = step of 1 per lane, %xmm1 = requested entry number.
887         movdqa  .LOne(%rip),%xmm0
888         movd    %edx,%xmm1
889
        # Clear the six 16-byte accumulators.
890         pxor    %xmm2,%xmm2
891         pxor    %xmm3,%xmm3
892         pxor    %xmm4,%xmm4
893         pxor    %xmm5,%xmm5
894         pxor    %xmm6,%xmm6
895         pxor    %xmm7,%xmm7
896
        # %xmm8 = running entry counter (starts at 1); broadcast the requested
        # number to all four lanes of %xmm1.
897         movdqa  %xmm0,%xmm8
898         pshufd  $0,%xmm1,%xmm1
899
900         movq    $16,%rax
901 .Lselect_loop_sse_w5:
902
        # %xmm15 = all ones when the counter equals the request, else zero.
903         movdqa  %xmm8,%xmm15
904         paddd   %xmm0,%xmm8
905         pcmpeqd %xmm1,%xmm15
906
        # Load the current entry and advance to the next one.
907         movdqa  0(%rsi),%xmm9
908         movdqa  16(%rsi),%xmm10
909         movdqa  32(%rsi),%xmm11
910         movdqa  48(%rsi),%xmm12
911         movdqa  64(%rsi),%xmm13
912         movdqa  80(%rsi),%xmm14
913         leaq    96(%rsi),%rsi
914
        # Mask the entry and merge it into the accumulators.
915         pand    %xmm15,%xmm9
916         pand    %xmm15,%xmm10
917         por     %xmm9,%xmm2
918         pand    %xmm15,%xmm11
919         por     %xmm10,%xmm3
920         pand    %xmm15,%xmm12
921         por     %xmm11,%xmm4
922         pand    %xmm15,%xmm13
923         por     %xmm12,%xmm5
924         pand    %xmm15,%xmm14
925         por     %xmm13,%xmm6
926         por     %xmm14,%xmm7
927
928         decq    %rax
929         jnz     .Lselect_loop_sse_w5
930
931         movdqu  %xmm2,0(%rdi)
932         movdqu  %xmm3,16(%rdi)
933         movdqu  %xmm4,32(%rdi)
934         movdqu  %xmm5,48(%rdi)
935         movdqu  %xmm6,64(%rdi)
936         movdqu  %xmm7,80(%rdi)
        # The bytes 0xf3,0xc3 encode "rep ret".
937         .byte   0xf3,0xc3
938 .size   ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
939
940
941
# ecp_nistz256_select_w7: copy one 64-byte entry out of a 64-entry table.
#   In:   %rdi = address of the 64-byte output, %rsi = address of the table
#         (64 entries of 64 bytes), %edx = entry number
#   Out:  (%rdi) = entry number %edx, counting from 1; all zero bytes when
#         %edx matches none of 1..64
# Every entry is loaded on every call and the wanted one is picked with a
# compare mask, so the addresses read do not depend on %edx.
# The table is read with movdqa, which requires 16-byte alignment; the output
# is written with movdqu and carries no alignment requirement.
# Uses %rax, %rsi and %xmm0-%xmm5, %xmm8-%xmm12, %xmm15 as scratch.
942 .globl  ecp_nistz256_select_w7
943 .type   ecp_nistz256_select_w7,@function
944 .align  32
945 ecp_nistz256_select_w7:
        # %xmm8 = running entry counter (starts at 1), %xmm1 = requested number.
946         movdqa  .LOne(%rip),%xmm8
947         movd    %edx,%xmm1
948
        # Clear the four 16-byte accumulators.
949         pxor    %xmm2,%xmm2
950         pxor    %xmm3,%xmm3
951         pxor    %xmm4,%xmm4
952         pxor    %xmm5,%xmm5
953
        # %xmm0 = step of 1 per lane; broadcast the request to all lanes.
954         movdqa  %xmm8,%xmm0
955         pshufd  $0,%xmm1,%xmm1
956         movq    $64,%rax
957
958 .Lselect_loop_sse_w7:
        # %xmm15 = all ones when the counter equals the request, else zero;
        # the compare is interleaved with the loads of the current entry.
959         movdqa  %xmm8,%xmm15
960         paddd   %xmm0,%xmm8
961         movdqa  0(%rsi),%xmm9
962         movdqa  16(%rsi),%xmm10
963         pcmpeqd %xmm1,%xmm15
964         movdqa  32(%rsi),%xmm11
965         movdqa  48(%rsi),%xmm12
966         leaq    64(%rsi),%rsi
967
        # Mask the entry and merge it into the accumulators; the prefetch
        # targets memory 255 bytes past the next entry.
968         pand    %xmm15,%xmm9
969         pand    %xmm15,%xmm10
970         por     %xmm9,%xmm2
971         pand    %xmm15,%xmm11
972         por     %xmm10,%xmm3
973         pand    %xmm15,%xmm12
974         por     %xmm11,%xmm4
975         prefetcht0      255(%rsi)
976         por     %xmm12,%xmm5
977
978         decq    %rax
979         jnz     .Lselect_loop_sse_w7
980
981         movdqu  %xmm2,0(%rdi)
982         movdqu  %xmm3,16(%rdi)
983         movdqu  %xmm4,32(%rdi)
984         movdqu  %xmm5,48(%rdi)
        # The bytes 0xf3,0xc3 encode "rep ret".
985         .byte   0xf3,0xc3
986 .size   ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
# ecp_nistz256_avx2_select_w7: placeholder for an AVX2 table select.
# The body is only the two bytes 0x0f,0x0b, which encode UD2, so calling this
# symbol raises an invalid-opcode exception; the "rep ret" after it is never
# reached. NOTE(review): presumably emitted when the build has no AVX2
# assembler support -- confirm that no caller can select this path.
987 .globl  ecp_nistz256_avx2_select_w7
988 .type   ecp_nistz256_avx2_select_w7,@function
989 .align  32
990 ecp_nistz256_avx2_select_w7:
991 .byte   0x0f,0x0b
992         .byte   0xf3,0xc3
993 .size   ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
# __ecp_nistz256_add_toq: internal helper, adds a 4-limb value in memory to
# the 4-limb value held in registers, modulo p.
#   In:   %r12,%r13,%r8,%r9 = a[0..3], %rbx = address of b,
#         %r14 and %r15 = limbs 1 and 3 of p (ecp_nistz256_point_double loads
#         them from .Lpoly+8 and .Lpoly+24), %rdi = address of the result
#   Out:  sum in %r12,%r13,%r8,%r9 and stored at (%rdi)
#   Clobbers %rax, %rbp, %rcx, %r10, %r11 and flags.
# NOTE(review): p is subtracted only when the addition carried out of bit 255,
# so a sum in [p, 2^256) is returned without reduction -- confirm that every
# caller tolerates a result that is not fully reduced.
994 .type   __ecp_nistz256_add_toq,@function
995 .align  32
996 __ecp_nistz256_add_toq:
        # Sum = a + b; %r11 becomes -1 on carry out and 0 otherwise.
997         addq    0(%rbx),%r12
998         adcq    8(%rbx),%r13
999         movq    %r12,%rax
1000         adcq    16(%rbx),%r8
1001         adcq    24(%rbx),%r9
1002         movq    %r13,%rbp
1003         sbbq    %r11,%r11
1004
        # Subtract p (immediates stand in for limbs 0 and 2);
        # %rax,%rbp,%rcx,%r10 keep the unsubtracted sum.
1005         subq    $-1,%r12
1006         movq    %r8,%rcx
1007         sbbq    %r14,%r13
1008         sbbq    $0,%r8
1009         movq    %r9,%r10
1010         sbbq    %r15,%r9
1011         testq   %r11,%r11
1012
        # No carry (%r11 == 0): restore the unsubtracted sum.
1013         cmovzq  %rax,%r12
1014         cmovzq  %rbp,%r13
1015         movq    %r12,0(%rdi)
1016         cmovzq  %rcx,%r8
1017         movq    %r13,8(%rdi)
1018         cmovzq  %r10,%r9
1019         movq    %r8,16(%rdi)
1020         movq    %r9,24(%rdi)
1021
        # The bytes 0xf3,0xc3 encode "rep ret".
1022         .byte   0xf3,0xc3
1023 .size   __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1024
# __ecp_nistz256_sub_fromq: internal helper, subtracts a 4-limb value in
# memory from the 4-limb value held in registers, modulo p.
#   In:   %r12,%r13,%r8,%r9 = a[0..3], %rbx = address of b,
#         %r14 and %r15 = limbs 1 and 3 of p, %rdi = address of the result
#   Out:  a - b (with p added back on borrow) in %r12,%r13,%r8,%r9 and
#         stored at (%rdi)
#   Clobbers %rax, %rbp, %rcx, %r10, %r11 and flags.
1025 .type   __ecp_nistz256_sub_fromq,@function
1026 .align  32
1027 __ecp_nistz256_sub_fromq:
        # Difference = a - b; %r11 becomes -1 on borrow and 0 otherwise.
1028         subq    0(%rbx),%r12
1029         sbbq    8(%rbx),%r13
1030         movq    %r12,%rax
1031         sbbq    16(%rbx),%r8
1032         sbbq    24(%rbx),%r9
1033         movq    %r13,%rbp
1034         sbbq    %r11,%r11
1035
        # Add p (immediates stand in for limbs 0 and 2);
        # %rax,%rbp,%rcx,%r10 keep the raw difference.
1036         addq    $-1,%r12
1037         movq    %r8,%rcx
1038         adcq    %r14,%r13
1039         adcq    $0,%r8
1040         movq    %r9,%r10
1041         adcq    %r15,%r9
1042         testq   %r11,%r11
1043
        # No borrow (%r11 == 0): restore the raw difference.
1044         cmovzq  %rax,%r12
1045         cmovzq  %rbp,%r13
1046         movq    %r12,0(%rdi)
1047         cmovzq  %rcx,%r8
1048         movq    %r13,8(%rdi)
1049         cmovzq  %r10,%r9
1050         movq    %r8,16(%rdi)
1051         movq    %r9,24(%rdi)
1052
        # The bytes 0xf3,0xc3 encode "rep ret".
1053         .byte   0xf3,0xc3
1054 .size   __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1055
# __ecp_nistz256_subq: internal helper, register-only modular subtraction.
#   In:   %rax,%rbp,%rcx,%r10 = minuend limbs 0..3,
#         %r12,%r13,%r8,%r9 = subtrahend limbs 0..3,
#         %r14 and %r15 = limbs 1 and 3 of p
#   Out:  minuend - subtrahend (with p added back on borrow) in
#         %r12,%r13,%r8,%r9; nothing is written to memory
#   Clobbers %rax, %rbp, %rcx, %r10, %r11 and flags.
1056 .type   __ecp_nistz256_subq,@function
1057 .align  32
1058 __ecp_nistz256_subq:
        # Raw difference is formed in %rax,%rbp,%rcx,%r10 and copied into the
        # output registers; %r11 becomes -1 on borrow and 0 otherwise.
1059         subq    %r12,%rax
1060         sbbq    %r13,%rbp
1061         movq    %rax,%r12
1062         sbbq    %r8,%rcx
1063         sbbq    %r9,%r10
1064         movq    %rbp,%r13
1065         sbbq    %r11,%r11
1066
        # Add p to the working copy (immediates stand in for limbs 0 and 2).
1067         addq    $-1,%rax
1068         movq    %rcx,%r8
1069         adcq    %r14,%rbp
1070         adcq    $0,%rcx
1071         movq    %r10,%r9
1072         adcq    %r15,%r10
1073         testq   %r11,%r11
1074
        # Borrow (%r11 != 0): take the copy that had p added.
1075         cmovnzq %rax,%r12
1076         cmovnzq %rbp,%r13
1077         cmovnzq %rcx,%r8
1078         cmovnzq %r10,%r9
1079
        # The bytes 0xf3,0xc3 encode "rep ret".
1080         .byte   0xf3,0xc3
1081 .size   __ecp_nistz256_subq,.-__ecp_nistz256_subq
1082
# __ecp_nistz256_mul_by_2q: internal helper, doubles the 4-limb value held in
# registers, modulo p.
#   In:   %r12,%r13,%r8,%r9 = a[0..3],
#         %r14 and %r15 = limbs 1 and 3 of p, %rdi = address of the result
#   Out:  doubled value in %r12,%r13,%r8,%r9 and stored at (%rdi)
#   Clobbers %rax, %rbp, %rcx, %r10, %r11 and flags.
# NOTE(review): p is subtracted only when the doubling carried out of bit 255,
# so a result in [p, 2^256) is returned without reduction -- confirm that
# every caller tolerates a result that is not fully reduced.
1083 .type   __ecp_nistz256_mul_by_2q,@function
1084 .align  32
1085 __ecp_nistz256_mul_by_2q:
        # Sum = a + a; %r11 becomes -1 on carry out and 0 otherwise.
1086         addq    %r12,%r12
1087         adcq    %r13,%r13
1088         movq    %r12,%rax
1089         adcq    %r8,%r8
1090         adcq    %r9,%r9
1091         movq    %r13,%rbp
1092         sbbq    %r11,%r11
1093
        # Subtract p (immediates stand in for limbs 0 and 2);
        # %rax,%rbp,%rcx,%r10 keep the unsubtracted sum.
1094         subq    $-1,%r12
1095         movq    %r8,%rcx
1096         sbbq    %r14,%r13
1097         sbbq    $0,%r8
1098         movq    %r9,%r10
1099         sbbq    %r15,%r9
1100         testq   %r11,%r11
1101
        # No carry (%r11 == 0): restore the unsubtracted sum.
1102         cmovzq  %rax,%r12
1103         cmovzq  %rbp,%r13
1104         movq    %r12,0(%rdi)
1105         cmovzq  %rcx,%r8
1106         movq    %r13,8(%rdi)
1107         cmovzq  %r10,%r9
1108         movq    %r8,16(%rdi)
1109         movq    %r9,24(%rdi)
1110
        # The bytes 0xf3,0xc3 encode "rep ret".
1111         .byte   0xf3,0xc3
1112 .size   __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1113 .globl  ecp_nistz256_point_double
1114 .type   ecp_nistz256_point_double,@function
1115 .align  32
1116 ecp_nistz256_point_double:
1117         pushq   %rbp
1118         pushq   %rbx
1119         pushq   %r12
1120         pushq   %r13
1121         pushq   %r14
1122         pushq   %r15
1123         subq    $160+8,%rsp
1124
1125 .Lpoint_double_shortcutq:
1126         movdqu  0(%rsi),%xmm0
1127         movq    %rsi,%rbx
1128         movdqu  16(%rsi),%xmm1
1129         movq    32+0(%rsi),%r12
1130         movq    32+8(%rsi),%r13
1131         movq    32+16(%rsi),%r8
1132         movq    32+24(%rsi),%r9
1133         movq    .Lpoly+8(%rip),%r14
1134         movq    .Lpoly+24(%rip),%r15
1135         movdqa  %xmm0,96(%rsp)
1136         movdqa  %xmm1,96+16(%rsp)
1137         leaq    32(%rdi),%r10
1138         leaq    64(%rdi),%r11
1139 .byte   102,72,15,110,199
1140 .byte   102,73,15,110,202
1141 .byte   102,73,15,110,211
1142
1143         leaq    0(%rsp),%rdi
1144         call    __ecp_nistz256_mul_by_2q
1145
1146         movq    64+0(%rsi),%rax
1147         movq    64+8(%rsi),%r14
1148         movq    64+16(%rsi),%r15
1149         movq    64+24(%rsi),%r8
1150         leaq    64-0(%rsi),%rsi
1151         leaq    64(%rsp),%rdi
1152         call    __ecp_nistz256_sqr_montq
1153
1154         movq    0+0(%rsp),%rax
1155         movq    8+0(%rsp),%r14
1156         leaq    0+0(%rsp),%rsi
1157         movq    16+0(%rsp),%r15
1158         movq    24+0(%rsp),%r8
1159         leaq    0(%rsp),%rdi
1160         call    __ecp_nistz256_sqr_montq
1161
1162         movq    32(%rbx),%rax
1163         movq    64+0(%rbx),%r9
1164         movq    64+8(%rbx),%r10
1165         movq    64+16(%rbx),%r11
1166         movq    64+24(%rbx),%r12
1167         leaq    64-0(%rbx),%rsi
1168         leaq    32(%rbx),%rbx
1169 .byte   102,72,15,126,215
1170         call    __ecp_nistz256_mul_montq
1171         call    __ecp_nistz256_mul_by_2q
1172
1173         movq    96+0(%rsp),%r12
1174         movq    96+8(%rsp),%r13
1175         leaq    64(%rsp),%rbx
1176         movq    96+16(%rsp),%r8
1177         movq    96+24(%rsp),%r9
1178         leaq    32(%rsp),%rdi
1179         call    __ecp_nistz256_add_toq
1180
1181         movq    96+0(%rsp),%r12
1182         movq    96+8(%rsp),%r13
1183         leaq    64(%rsp),%rbx
1184         movq    96+16(%rsp),%r8
1185         movq    96+24(%rsp),%r9
1186         leaq    64(%rsp),%rdi
1187         call    __ecp_nistz256_sub_fromq
1188
1189         movq    0+0(%rsp),%rax
1190         movq    8+0(%rsp),%r14
1191         leaq    0+0(%rsp),%rsi
1192         movq    16+0(%rsp),%r15
1193         movq    24+0(%rsp),%r8
1194 .byte   102,72,15,126,207
1195         call    __ecp_nistz256_sqr_montq
1196         xorq    %r9,%r9
1197         movq    %r12,%rax
1198         addq    $-1,%r12
1199         movq    %r13,%r10
1200         adcq    %rsi,%r13
1201         movq    %r14,%rcx
1202         adcq    $0,%r14
1203         movq    %r15,%r8
1204         adcq    %rbp,%r15
1205         adcq    $0,%r9
1206         xorq    %rsi,%rsi
1207         testq   $1,%rax
1208
1209         cmovzq  %rax,%r12
1210         cmovzq  %r10,%r13
1211         cmovzq  %rcx,%r14
1212         cmovzq  %r8,%r15
1213         cmovzq  %rsi,%r9
1214
1215         movq    %r13,%rax
1216         shrq    $1,%r12
1217         shlq    $63,%rax
1218         movq    %r14,%r10
1219         shrq    $1,%r13
1220         orq     %rax,%r12
1221         shlq    $63,%r10
1222         movq    %r15,%rcx
1223         shrq    $1,%r14
1224         orq     %r10,%r13
1225         shlq    $63,%rcx
1226         movq    %r12,0(%rdi)
1227         shrq    $1,%r15
1228         movq    %r13,8(%rdi)
1229         shlq    $63,%r9
1230         orq     %rcx,%r14
1231         orq     %r9,%r15
1232         movq    %r14,16(%rdi)
1233         movq    %r15,24(%rdi)
1234         movq    64(%rsp),%rax
1235         leaq    64(%rsp),%rbx
1236         movq    0+32(%rsp),%r9
1237         movq    8+32(%rsp),%r10
1238         leaq    0+32(%rsp),%rsi
1239         movq    16+32(%rsp),%r11
1240         movq    24+32(%rsp),%r12
1241         leaq    32(%rsp),%rdi
1242         call    __ecp_nistz256_mul_montq
1243
1244         leaq    128(%rsp),%rdi
1245         call    __ecp_nistz256_mul_by_2q
1246
1247         leaq    32(%rsp),%rbx
1248         leaq    32(%rsp),%rdi
1249         call    __ecp_nistz256_add_toq
1250
1251         movq    96(%rsp),%rax
1252         leaq    96(%rsp),%rbx
1253         movq    0+0(%rsp),%r9
1254         movq    8+0(%rsp),%r10
1255         leaq    0+0(%rsp),%rsi
1256         movq    16+0(%rsp),%r11
1257         movq    24+0(%rsp),%r12
1258         leaq    0(%rsp),%rdi
1259         call    __ecp_nistz256_mul_montq
1260
1261         leaq    128(%rsp),%rdi
1262         call    __ecp_nistz256_mul_by_2q
1263
1264         movq    0+32(%rsp),%rax
1265         movq    8+32(%rsp),%r14
1266         leaq    0+32(%rsp),%rsi
1267         movq    16+32(%rsp),%r15
1268         movq    24+32(%rsp),%r8
1269 .byte   102,72,15,126,199
1270         call    __ecp_nistz256_sqr_montq
1271
1272         leaq    128(%rsp),%rbx
1273         movq    %r14,%r8
1274         movq    %r15,%r9
1275         movq    %rsi,%r14
1276         movq    %rbp,%r15
1277         call    __ecp_nistz256_sub_fromq
1278
1279         movq    0+0(%rsp),%rax
1280         movq    0+8(%rsp),%rbp
1281         movq    0+16(%rsp),%rcx
1282         movq    0+24(%rsp),%r10
1283         leaq    0(%rsp),%rdi
1284         call    __ecp_nistz256_subq
1285
1286         movq    32(%rsp),%rax
1287         leaq    32(%rsp),%rbx
1288         movq    %r12,%r14
1289         xorl    %ecx,%ecx
1290         movq    %r12,0+0(%rsp)
1291         movq    %r13,%r10
1292         movq    %r13,0+8(%rsp)
1293         cmovzq  %r8,%r11
1294         movq    %r8,0+16(%rsp)
1295         leaq    0-0(%rsp),%rsi
1296         cmovzq  %r9,%r12
1297         movq    %r9,0+24(%rsp)
1298         movq    %r14,%r9
1299         leaq    0(%rsp),%rdi
1300         call    __ecp_nistz256_mul_montq
1301
1302 .byte   102,72,15,126,203
1303 .byte   102,72,15,126,207
1304         call    __ecp_nistz256_sub_fromq
1305
1306         addq    $160+8,%rsp
1307         popq    %r15
1308         popq    %r14
1309         popq    %r13
1310         popq    %r12
1311         popq    %rbx
1312         popq    %rbp
1313         .byte   0xf3,0xc3
1314 .size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
# ecp_nistz256_point_add
#
# Arguments as consumed below (System V AMD64 register order):
#   %rdi  destination, 96 bytes are written
#   %rsi  first input  (in1), 96 bytes are read
#   %rdx  second input (in2), 96 bytes are read
# NOTE(review): each 32-byte third is presumably one Jacobian coordinate
# (X, Y, Z) kept as four 64-bit limbs in Montgomery form; confirm against
# the C prototype.
#
# Frame: 576 bytes of slots plus 8 bytes of padding.  With the usual entry
# alignment and six pushes the padding leaves %rsp 16-byte aligned, which
# the movdqa/pand/pandn memory operands below require.
#   H     =   0(%rsp)    Hsqr (first Z1sqr) =  32(%rsp)
#   R     =  64(%rsp)    Rsqr (first Z2sqr) =  96(%rsp)
#   Hcub  = 128(%rsp)    U1    = 160(%rsp)    U2    = 192(%rsp)
#   S1    = 224(%rsp)    S2    = 256(%rsp)
#   res_x = 288(%rsp)    res_y = 320(%rsp)    res_z = 352(%rsp)
#   in1 copy = 384/416/448(%rsp)   in2 copy = 480/512/544(%rsp)
#
# Values kept in SSE registers across the helper calls (the code relies on
# the helpers leaving them untouched):
#   %xmm0  saved destination pointer
#   %xmm1  saved in1 pointer
#   %xmm5  all-ones when the first 64 bytes of in1 are zero (in1infty)
#   %xmm4  all-ones when the first 64 bytes of in2 are zero (in2infty)
#
# NOTE(review): the __ecp_nistz256_*q helpers are defined outside this
# block.  The formulas in the comments assume, from names and call pattern:
#   sqr_montq : [%rdi] = [%rsi]^2, input limbs also in %rax,%r14,%r15,%r8
#   mul_montq : [%rdi] = [%rsi] * [%rbx], limbs preloaded in %rax,%r9-%r12
#   sub_fromq : [%rdi] = (%r12,%r13,%r8,%r9) - [%rbx]
#   subq      : (%r12,%r13,%r8,%r9) = (%rax,%rbp,%rcx,%r10) - (%r12,%r13,%r8,%r9)
# with results also left in %r12,%r13,%r8,%r9.  Confirm before relying on it.
#
# Change relative to the previous revision: the inline doubling of U2 now
# subtracts p whenever the doubled value is >= p.  Previously p was only
# subtracted when the addition carried out of bit 255, so a value in
# [p, 2^256) could reach __ecp_nistz256_subq unreduced.
.globl  ecp_nistz256_point_add
.type   ecp_nistz256_point_add,@function
.align  32
ecp_nistz256_point_add:
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $576+8,%rsp

        # Copy all 96 bytes of in1 to 384..479(%rsp) and OR its first
        # 64 bytes together into %xmm3.  Keep in1 in %rbx, move in2 to %rsi.
        movdqu  0(%rsi),%xmm0
        movdqu  16(%rsi),%xmm1
        movdqu  32(%rsi),%xmm2
        movdqu  48(%rsi),%xmm3
        movdqu  64(%rsi),%xmm4
        movdqu  80(%rsi),%xmm5
        movq    %rsi,%rbx
        movq    %rdx,%rsi
        movdqa  %xmm0,384(%rsp)
        movdqa  %xmm1,384+16(%rsp)
        por     %xmm0,%xmm1
        movdqa  %xmm2,416(%rsp)
        movdqa  %xmm3,416+16(%rsp)
        por     %xmm2,%xmm3
        movdqa  %xmm4,448(%rsp)
        movdqa  %xmm5,448+16(%rsp)
        por     %xmm1,%xmm3

        # Copy the first 64 bytes of in2 to 480..543(%rsp), load its last
        # 32 bytes into %rax,%r14,%r15,%r8, and meanwhile fold the in1 OR
        # so that dword 0 of %xmm5 holds the OR of all sixteen dwords.
        movdqu  0(%rsi),%xmm0
        pshufd  $0xb1,%xmm3,%xmm5
        movdqu  16(%rsi),%xmm1
        movdqu  32(%rsi),%xmm2
        por     %xmm3,%xmm5
        movdqu  48(%rsi),%xmm3
        movq    64+0(%rsi),%rax
        movq    64+8(%rsi),%r14
        movq    64+16(%rsi),%r15
        movq    64+24(%rsi),%r8
        movdqa  %xmm0,480(%rsp)
        pshufd  $0x1e,%xmm5,%xmm4
        movdqa  %xmm1,480+16(%rsp)
        por     %xmm0,%xmm1
.byte   102,72,15,110,199               # movq %rdi,%xmm0 (save destination)
        movdqa  %xmm2,512(%rsp)
        movdqa  %xmm3,512+16(%rsp)
        por     %xmm2,%xmm3
        por     %xmm4,%xmm5
        pxor    %xmm4,%xmm4
        por     %xmm1,%xmm3

        # Store the last 32 bytes of in2 at 544(%rsp) and square them into
        # the 96(%rsp) slot (presumably Z2sqr = in2_z^2).
        leaq    64-0(%rsi),%rsi
        movq    %rax,544+0(%rsp)
        movq    %r14,544+8(%rsp)
        movq    %r15,544+16(%rsp)
        movq    %r8,544+24(%rsp)
        leaq    96(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Turn the folded ORs into broadcast masks: %xmm5 = in1infty,
        # %xmm4 = in2infty (all-ones when the OR was zero).
        pcmpeqd %xmm4,%xmm5
        pshufd  $0xb1,%xmm3,%xmm4
        por     %xmm3,%xmm4
        pshufd  $0,%xmm5,%xmm5
        pshufd  $0x1e,%xmm4,%xmm3
        por     %xmm3,%xmm4
        pxor    %xmm3,%xmm3
        pcmpeqd %xmm3,%xmm4
        pshufd  $0,%xmm4,%xmm4
        # Square the last 32 bytes of in1 into 32(%rsp)
        # (presumably Z1sqr = in1_z^2).
        movq    64+0(%rbx),%rax
        movq    64+8(%rbx),%r14
        movq    64+16(%rbx),%r15
        movq    64+24(%rbx),%r8
.byte   102,72,15,110,203               # movq %rbx,%xmm1 (save in1 pointer)

        leaq    64-0(%rbx),%rsi
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Presumably S1 = Z2sqr * in2_z
        movq    544(%rsp),%rax
        leaq    544(%rsp),%rbx
        movq    0+96(%rsp),%r9
        movq    8+96(%rsp),%r10
        leaq    0+96(%rsp),%rsi
        movq    16+96(%rsp),%r11
        movq    24+96(%rsp),%r12
        leaq    224(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably S2 = Z1sqr * in1_z
        movq    448(%rsp),%rax
        leaq    448(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    256(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably S1 = S1 * in1_y
        movq    416(%rsp),%rax
        leaq    416(%rsp),%rbx
        movq    0+224(%rsp),%r9
        movq    8+224(%rsp),%r10
        leaq    0+224(%rsp),%rsi
        movq    16+224(%rsp),%r11
        movq    24+224(%rsp),%r12
        leaq    224(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably S2 = S2 * in2_y
        movq    512(%rsp),%rax
        leaq    512(%rsp),%rbx
        movq    0+256(%rsp),%r9
        movq    8+256(%rsp),%r10
        leaq    0+256(%rsp),%rsi
        movq    16+256(%rsp),%r11
        movq    24+256(%rsp),%r12
        leaq    256(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably R = S2 - S1
        leaq    224(%rsp),%rbx
        leaq    64(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        # Remember whether the R limbs are all zero (%xmm3 = OR of limbs)
        # and whether either input was flagged (%xmm2 = in1infty | in2infty).
        orq     %r13,%r12
        movdqa  %xmm4,%xmm2
        orq     %r8,%r12
        orq     %r9,%r12
        por     %xmm5,%xmm2
.byte   102,73,15,110,220               # movq %r12,%xmm3

        # Presumably U1 = in1_x * Z2sqr
        movq    384(%rsp),%rax
        leaq    384(%rsp),%rbx
        movq    0+96(%rsp),%r9
        movq    8+96(%rsp),%r10
        leaq    0+96(%rsp),%rsi
        movq    16+96(%rsp),%r11
        movq    24+96(%rsp),%r12
        leaq    160(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably U2 = in2_x * Z1sqr
        movq    480(%rsp),%rax
        leaq    480(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    192(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably H = U2 - U1
        leaq    160(%rsp),%rbx
        leaq    0(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        orq     %r13,%r12
        orq     %r8,%r12
        orq     %r9,%r12

        # Dispatch on the three conditions gathered so far:
        #   H limbs nonzero                    -> general addition
        #   H zero, some input flagged         -> general addition (the
        #                                         final selects pick the other
        #                                         input)
        #   H zero, none flagged, R zero       -> jump into point doubling
        #   H zero, none flagged, R nonzero    -> write 96 zero bytes
.byte   0x3e                                    # prefix byte on the jnz (presumably a branch hint)
        jnz     .Ladd_proceedq
.byte   102,73,15,126,208               # movq %xmm2,%r8
.byte   102,73,15,126,217               # movq %xmm3,%r9
        testq   %r8,%r8
        jnz     .Ladd_proceedq
        testq   %r9,%r9
        jz      .Ladd_doubleq

.byte   102,72,15,126,199               # movq %xmm0,%rdi (destination)
        pxor    %xmm0,%xmm0
        movdqu  %xmm0,0(%rdi)
        movdqu  %xmm0,16(%rdi)
        movdqu  %xmm0,32(%rdi)
        movdqu  %xmm0,48(%rdi)
        movdqu  %xmm0,64(%rdi)
        movdqu  %xmm0,80(%rdi)
        jmp     .Ladd_doneq

.align  32
.Ladd_doubleq:
        # Restore the in1 and destination pointers, shrink this frame by
        # 416 = 576 - 160 bytes so it matches the 160+8 byte frame that
        # ecp_nistz256_point_double releases, and continue there.
.byte   102,72,15,126,206               # movq %xmm1,%rsi
.byte   102,72,15,126,199               # movq %xmm0,%rdi
        addq    $416,%rsp
        jmp     .Lpoint_double_shortcutq

.align  32
.Ladd_proceedq:
        # Presumably Rsqr = R^2
        movq    0+64(%rsp),%rax
        movq    8+64(%rsp),%r14
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r15
        movq    24+64(%rsp),%r8
        leaq    96(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Presumably res_z = H * in1_z
        movq    448(%rsp),%rax
        leaq    448(%rsp),%rbx
        movq    0+0(%rsp),%r9
        movq    8+0(%rsp),%r10
        leaq    0+0(%rsp),%rsi
        movq    16+0(%rsp),%r11
        movq    24+0(%rsp),%r12
        leaq    352(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably Hsqr = H^2
        movq    0+0(%rsp),%rax
        movq    8+0(%rsp),%r14
        leaq    0+0(%rsp),%rsi
        movq    16+0(%rsp),%r15
        movq    24+0(%rsp),%r8
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Presumably res_z = res_z * in2_z
        movq    544(%rsp),%rax
        leaq    544(%rsp),%rbx
        movq    0+352(%rsp),%r9
        movq    8+352(%rsp),%r10
        leaq    0+352(%rsp),%rsi
        movq    16+352(%rsp),%r11
        movq    24+352(%rsp),%r12
        leaq    352(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably Hcub = Hsqr * H
        movq    0(%rsp),%rax
        leaq    0(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    128(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably U2 = U1 * Hsqr
        movq    160(%rsp),%rax
        leaq    160(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    192(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Inline doubling of the value in %r12,%r13,%r8,%r9 (the product
        # just computed), with the carry out of bit 255 kept in %r11 as a
        # fifth limb.  The Rsqr slot is loaded into %rax,%rbp,%rcx,%r10 in
        # between, ready for the subtraction helper.
        xorq    %r11,%r11
        addq    %r12,%r12
        leaq    96(%rsp),%rsi
        adcq    %r13,%r13
        movq    %r12,%rax
        adcq    %r8,%r8
        adcq    %r9,%r9
        movq    %r13,%rbp
        adcq    $0,%r11

        # Subtract (-1, %r14, 0, %r15) from the five-limb value.
        # NOTE(review): %r14/%r15 are expected to hold .Lpoly[1] and
        # .Lpoly[3] on return from __ecp_nistz256_mul_montq; confirm.
        # A final borrow (CF set) means the doubled value was below p.
        subq    $-1,%r12
        movq    %r8,%rcx
        sbbq    %r14,%r13
        sbbq    $0,%r8
        movq    %r9,%r10
        sbbq    %r15,%r9
        sbbq    $0,%r11

        # On borrow put back the unsubtracted limbs.  The interleaved movq
        # loads do not change flags.
        cmovcq  %rax,%r12
        movq    0(%rsi),%rax
        cmovcq  %rbp,%r13
        movq    8(%rsi),%rbp
        cmovcq  %rcx,%r8
        movq    16(%rsi),%rcx
        cmovcq  %r10,%r9
        movq    24(%rsi),%r10

        # Presumably Rsqr - 2*U2, left in registers
        call    __ecp_nistz256_subq

        # Presumably res_x = (Rsqr - 2*U2) - Hcub
        leaq    128(%rsp),%rbx
        leaq    288(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        # Presumably res_y = U2 - res_x; the result limbs are stored to
        # (%rdi) by the four movq instructions after the call.
        movq    192+0(%rsp),%rax
        movq    192+8(%rsp),%rbp
        movq    192+16(%rsp),%rcx
        movq    192+24(%rsp),%r10
        leaq    320(%rsp),%rdi

        call    __ecp_nistz256_subq

        movq    %r12,0(%rdi)
        movq    %r13,8(%rdi)
        movq    %r8,16(%rdi)
        movq    %r9,24(%rdi)
        # Presumably S2 = S1 * Hcub
        movq    128(%rsp),%rax
        leaq    128(%rsp),%rbx
        movq    0+224(%rsp),%r9
        movq    8+224(%rsp),%r10
        leaq    0+224(%rsp),%rsi
        movq    16+224(%rsp),%r11
        movq    24+224(%rsp),%r12
        leaq    256(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably res_y = R * res_y
        movq    320(%rsp),%rax
        leaq    320(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    320(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably res_y = res_y - S2
        leaq    256(%rsp),%rbx
        leaq    320(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

.byte   102,72,15,126,199               # movq %xmm0,%rdi (destination)

        # Branch-free output selection, repeated for each 32-byte third:
        #   t   = in1infty ? in2 copy : computed result
        #   out = in2infty ? in1 copy : t
        # Last third (bytes 64..95): res_z / in2 at 544 / in1 at 448.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   352(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   352+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    544(%rsp),%xmm2
        pand    544+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    448(%rsp),%xmm2
        pand    448+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,64(%rdi)
        movdqu  %xmm3,80(%rdi)

        # First third (bytes 0..31): res_x / in2 at 480 / in1 at 384.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   288(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   288+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    480(%rsp),%xmm2
        pand    480+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    384(%rsp),%xmm2
        pand    384+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,0(%rdi)
        movdqu  %xmm3,16(%rdi)

        # Middle third (bytes 32..63): res_y / in2 at 512 / in1 at 416.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   320(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   320+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    512(%rsp),%xmm2
        pand    512+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    416(%rsp),%xmm2
        pand    416+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,32(%rdi)
        movdqu  %xmm3,48(%rdi)

.Ladd_doneq:
        addq    $576+8,%rsp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %rbp
        .byte   0xf3,0xc3                       # rep ret
.size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
# ecp_nistz256_point_add_affine
#
# Arguments as consumed below (System V AMD64 register order):
#   %rdi  destination, 96 bytes are written
#   %rsi  first input  (in1), 96 bytes are read
#   %rdx  second input (in2), only 64 bytes are read
# NOTE(review): in1 is presumably a Jacobian point (X, Y, Z) and in2 an
# affine point (X, Y), each coordinate four 64-bit limbs in Montgomery
# form; confirm against the C prototype.
#
# Frame: 480 bytes of slots plus 8 bytes of padding.  With the usual entry
# alignment and six pushes the padding leaves %rsp 16-byte aligned, which
# the movdqa/pand/pandn memory operands below require.
#   U2    =   0(%rsp)    S2 (first Z1sqr) =  32(%rsp)
#   H     =  64(%rsp)    R     =  96(%rsp)
#   Hsqr  = 128(%rsp)    Hcub  = 160(%rsp)    Rsqr  = 192(%rsp)
#   res_x = 224(%rsp)    res_y = 256(%rsp)    res_z = 288(%rsp)
#   in1 copy = 320/352/384(%rsp)   in2 copy = 416/448(%rsp)
#
# Values kept in SSE registers across the helper calls (the code relies on
# the helpers leaving them untouched):
#   %xmm0  saved destination pointer
#   %xmm5  all-ones when the first 64 bytes of in1 are zero (in1infty)
#   %xmm4  all-ones when the 64 bytes of in2 are zero       (in2infty)
#
# NOTE(review): the __ecp_nistz256_*q helpers are defined outside this
# block.  The formulas in the comments assume, from names and call pattern:
#   sqr_montq : [%rdi] = [%rsi]^2, input limbs also in %rax,%r14,%r15,%r8
#   mul_montq : [%rdi] = [%rsi] * [%rbx], limbs preloaded in %rax,%r9-%r12
#   sub_fromq : [%rdi] = (%r12,%r13,%r8,%r9) - [%rbx]
#   subq      : (%r12,%r13,%r8,%r9) = (%rax,%rbp,%rcx,%r10) - (%r12,%r13,%r8,%r9)
# Confirm before relying on it.
#
# Change relative to the previous revision: the inline doubling of U2 now
# subtracts p whenever the doubled value is >= p.  Previously p was only
# subtracted when the addition carried out of bit 255, so a value in
# [p, 2^256) could reach __ecp_nistz256_subq unreduced.
.globl  ecp_nistz256_point_add_affine
.type   ecp_nistz256_point_add_affine,@function
.align  32
ecp_nistz256_point_add_affine:
        pushq   %rbp
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $480+8,%rsp

        # Copy all 96 bytes of in1 to 320..415(%rsp), OR its first 64 bytes
        # into %xmm3, and load its last 32 bytes into %rax,%r14,%r15,%r8
        # for the squaring below.  %rbx keeps the in2 pointer.
        movdqu  0(%rsi),%xmm0
        movq    %rdx,%rbx
        movdqu  16(%rsi),%xmm1
        movdqu  32(%rsi),%xmm2
        movdqu  48(%rsi),%xmm3
        movdqu  64(%rsi),%xmm4
        movdqu  80(%rsi),%xmm5
        movq    64+0(%rsi),%rax
        movq    64+8(%rsi),%r14
        movq    64+16(%rsi),%r15
        movq    64+24(%rsi),%r8
        movdqa  %xmm0,320(%rsp)
        movdqa  %xmm1,320+16(%rsp)
        por     %xmm0,%xmm1
        movdqa  %xmm2,352(%rsp)
        movdqa  %xmm3,352+16(%rsp)
        por     %xmm2,%xmm3
        movdqa  %xmm4,384(%rsp)
        movdqa  %xmm5,384+16(%rsp)
        por     %xmm1,%xmm3

        # Copy the 64 bytes of in2 to 416..479(%rsp) and meanwhile fold the
        # in1 OR so that dword 0 of %xmm5 holds the OR of all sixteen dwords.
        movdqu  0(%rbx),%xmm0
        pshufd  $0xb1,%xmm3,%xmm5
        movdqu  16(%rbx),%xmm1
        movdqu  32(%rbx),%xmm2
        por     %xmm3,%xmm5
        movdqu  48(%rbx),%xmm3
        movdqa  %xmm0,416(%rsp)
        pshufd  $0x1e,%xmm5,%xmm4
        movdqa  %xmm1,416+16(%rsp)
        por     %xmm0,%xmm1
.byte   102,72,15,110,199               # movq %rdi,%xmm0 (save destination)
        movdqa  %xmm2,448(%rsp)
        movdqa  %xmm3,448+16(%rsp)
        por     %xmm2,%xmm3
        por     %xmm4,%xmm5
        pxor    %xmm4,%xmm4
        por     %xmm1,%xmm3

        # Square the last 32 bytes of in1 into 32(%rsp)
        # (presumably Z1sqr = in1_z^2).
        leaq    64-0(%rsi),%rsi
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Turn the folded ORs into broadcast masks (%xmm5 = in1infty,
        # %xmm4 = in2infty).  Interleaved with that, the limbs left in
        # %r12-%r15 by the squaring are moved to %r9-%r12 and the first
        # limb of in2 is loaded, i.e. the multiplication below is set up
        # straight from registers.
        pcmpeqd %xmm4,%xmm5
        pshufd  $0xb1,%xmm3,%xmm4
        movq    0(%rbx),%rax

        movq    %r12,%r9
        por     %xmm3,%xmm4
        pshufd  $0,%xmm5,%xmm5
        pshufd  $0x1e,%xmm4,%xmm3
        movq    %r13,%r10
        por     %xmm3,%xmm4
        pxor    %xmm3,%xmm3
        movq    %r14,%r11
        pcmpeqd %xmm3,%xmm4
        pshufd  $0,%xmm4,%xmm4

        # Presumably U2 = Z1sqr * in2_x (%rbx still points at in2)
        leaq    32-0(%rsp),%rsi
        movq    %r15,%r12
        leaq    0(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably H = U2 - in1_x
        leaq    320(%rsp),%rbx
        leaq    64(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        # Presumably S2 = Z1sqr * in1_z
        movq    384(%rsp),%rax
        leaq    384(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably res_z = H * in1_z
        movq    384(%rsp),%rax
        leaq    384(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    288(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably S2 = S2 * in2_y
        movq    448(%rsp),%rax
        leaq    448(%rsp),%rbx
        movq    0+32(%rsp),%r9
        movq    8+32(%rsp),%r10
        leaq    0+32(%rsp),%rsi
        movq    16+32(%rsp),%r11
        movq    24+32(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably R = S2 - in1_y
        leaq    352(%rsp),%rbx
        leaq    96(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        # Presumably Hsqr = H^2
        movq    0+64(%rsp),%rax
        movq    8+64(%rsp),%r14
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r15
        movq    24+64(%rsp),%r8
        leaq    128(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Presumably Rsqr = R^2
        movq    0+96(%rsp),%rax
        movq    8+96(%rsp),%r14
        leaq    0+96(%rsp),%rsi
        movq    16+96(%rsp),%r15
        movq    24+96(%rsp),%r8
        leaq    192(%rsp),%rdi
        call    __ecp_nistz256_sqr_montq

        # Presumably Hcub = H * Hsqr
        movq    128(%rsp),%rax
        leaq    128(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    160(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably U2 = in1_x * Hsqr
        movq    320(%rsp),%rax
        leaq    320(%rsp),%rbx
        movq    0+128(%rsp),%r9
        movq    8+128(%rsp),%r10
        leaq    0+128(%rsp),%rsi
        movq    16+128(%rsp),%r11
        movq    24+128(%rsp),%r12
        leaq    0(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Inline doubling of the value in %r12,%r13,%r8,%r9 (the product
        # just computed), with the carry out of bit 255 kept in %r11 as a
        # fifth limb.  The Rsqr slot is loaded into %rax,%rbp,%rcx,%r10 in
        # between, ready for the subtraction helper.
        xorq    %r11,%r11
        addq    %r12,%r12
        leaq    192(%rsp),%rsi
        adcq    %r13,%r13
        movq    %r12,%rax
        adcq    %r8,%r8
        adcq    %r9,%r9
        movq    %r13,%rbp
        adcq    $0,%r11

        # Subtract (-1, %r14, 0, %r15) from the five-limb value.
        # NOTE(review): %r14/%r15 are expected to hold .Lpoly[1] and
        # .Lpoly[3] on return from __ecp_nistz256_mul_montq; confirm.
        # A final borrow (CF set) means the doubled value was below p.
        subq    $-1,%r12
        movq    %r8,%rcx
        sbbq    %r14,%r13
        sbbq    $0,%r8
        movq    %r9,%r10
        sbbq    %r15,%r9
        sbbq    $0,%r11

        # On borrow put back the unsubtracted limbs.  The interleaved movq
        # loads do not change flags.
        cmovcq  %rax,%r12
        movq    0(%rsi),%rax
        cmovcq  %rbp,%r13
        movq    8(%rsi),%rbp
        cmovcq  %rcx,%r8
        movq    16(%rsi),%rcx
        cmovcq  %r10,%r9
        movq    24(%rsi),%r10

        # Presumably Rsqr - 2*U2, left in registers
        call    __ecp_nistz256_subq

        # Presumably res_x = (Rsqr - 2*U2) - Hcub
        leaq    160(%rsp),%rbx
        leaq    224(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

        # Presumably U2 - res_x, stored over the H slot by the four movq
        # instructions after the call.
        movq    0+0(%rsp),%rax
        movq    0+8(%rsp),%rbp
        movq    0+16(%rsp),%rcx
        movq    0+24(%rsp),%r10
        leaq    64(%rsp),%rdi

        call    __ecp_nistz256_subq

        movq    %r12,0(%rdi)
        movq    %r13,8(%rdi)
        movq    %r8,16(%rdi)
        movq    %r9,24(%rdi)
        # Presumably S2 = Hcub * in1_y
        movq    352(%rsp),%rax
        leaq    352(%rsp),%rbx
        movq    0+160(%rsp),%r9
        movq    8+160(%rsp),%r10
        leaq    0+160(%rsp),%rsi
        movq    16+160(%rsp),%r11
        movq    24+160(%rsp),%r12
        leaq    32(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably H slot = (U2 - res_x) * R
        movq    96(%rsp),%rax
        leaq    96(%rsp),%rbx
        movq    0+64(%rsp),%r9
        movq    8+64(%rsp),%r10
        leaq    0+64(%rsp),%rsi
        movq    16+64(%rsp),%r11
        movq    24+64(%rsp),%r12
        leaq    64(%rsp),%rdi
        call    __ecp_nistz256_mul_montq

        # Presumably res_y = that product - S2
        leaq    32(%rsp),%rbx
        leaq    256(%rsp),%rdi
        call    __ecp_nistz256_sub_fromq

.byte   102,72,15,126,199               # movq %xmm0,%rdi (destination)

        # Branch-free output selection, repeated for each 32-byte third:
        #   t   = in1infty ? value derived from in2 : computed result
        #   out = in2infty ? in1 copy               : t
        # Last third (bytes 64..95): in2 has no third coordinate here, so
        # the constant .LONE_mont is used in its place.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   288(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   288+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    .LONE_mont(%rip),%xmm2
        pand    .LONE_mont+16(%rip),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    384(%rsp),%xmm2
        pand    384+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,64(%rdi)
        movdqu  %xmm3,80(%rdi)

        # First third (bytes 0..31): res_x / in2 at 416 / in1 at 320.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   224(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   224+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    416(%rsp),%xmm2
        pand    416+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    320(%rsp),%xmm2
        pand    320+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,0(%rdi)
        movdqu  %xmm3,16(%rdi)

        # Middle third (bytes 32..63): res_y / in2 at 448 / in1 at 352.
        movdqa  %xmm5,%xmm0
        movdqa  %xmm5,%xmm1
        pandn   256(%rsp),%xmm0
        movdqa  %xmm5,%xmm2
        pandn   256+16(%rsp),%xmm1
        movdqa  %xmm5,%xmm3
        pand    448(%rsp),%xmm2
        pand    448+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3

        movdqa  %xmm4,%xmm0
        movdqa  %xmm4,%xmm1
        pandn   %xmm2,%xmm0
        movdqa  %xmm4,%xmm2
        pandn   %xmm3,%xmm1
        movdqa  %xmm4,%xmm3
        pand    352(%rsp),%xmm2
        pand    352+16(%rsp),%xmm3
        por     %xmm0,%xmm2
        por     %xmm1,%xmm3
        movdqu  %xmm2,32(%rdi)
        movdqu  %xmm3,48(%rdi)

        addq    $480+8,%rsp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %rbp
        .byte   0xf3,0xc3                       # rep ret
.size   ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine