#-----------------------------------------------------------------------
# bn_mul_mont_gather5 (sparse excerpt — many interior lines not visible)
#
# Montgomery multiplication where the multiplier is fetched from a table
# of scattered values via SSE2 mask-and-AND selection, so the secret
# table index never forms a memory address (constant-time gather).
# NOTE(review): every line below carries a stray leading number — this
# looks like an extraction artifact that GAS would reject; confirm
# against the upstream perlasm output before assembling.
#-----------------------------------------------------------------------
6 .globl bn_mul_mont_gather5
7 .type bn_mul_mont_gather5,@function
# Carve the working area below %rsp (frame size derived from %r11).
29 leaq -264(%rsp,%r11,8),%rsp
32 movq %rax,8(%rsp,%r9,8)
# Fill sixteen 16-byte slots at 112..352(%r10) — presumably
# comparison-generated selection masks, one per table row; the pand
# sequence further down depends on exactly this layout (TODO confirm
# against the mask-generation lines not visible in this excerpt).
37 leaq 24-112(%rsp,%r9,8),%r10
49 movdqa %xmm0,112(%r10)
54 movdqa %xmm1,128(%r10)
59 movdqa %xmm2,144(%r10)
64 movdqa %xmm3,160(%r10)
68 movdqa %xmm0,176(%r10)
73 movdqa %xmm1,192(%r10)
78 movdqa %xmm2,208(%r10)
83 movdqa %xmm3,224(%r10)
87 movdqa %xmm0,240(%r10)
92 movdqa %xmm1,256(%r10)
97 movdqa %xmm2,272(%r10)
102 movdqa %xmm3,288(%r10)
106 movdqa %xmm0,304(%r10)
111 movdqa %xmm1,320(%r10)
114 movdqa %xmm2,336(%r10)
119 movdqa %xmm3,352(%r10)
# Constant-time gather: read rows of the table at %r12 unconditionally
# (masking/accumulation lines are not all visible in this excerpt).
123 movdqa -128(%r12),%xmm4
124 movdqa -112(%r12),%xmm5
125 movdqa -96(%r12),%xmm2
127 movdqa -80(%r12),%xmm3
135 movdqa -64(%r12),%xmm4
136 movdqa -48(%r12),%xmm5
137 movdqa -32(%r12),%xmm2
139 movdqa -16(%r12),%xmm3
148 movdqa 16(%r12),%xmm5
149 movdqa 32(%r12),%xmm2
151 movdqa 48(%r12),%xmm3
# Fold the two 64-bit halves together; the .byte run is a raw-encoded
# SSE2 xmm->GPR move (presumably 66 48 0F 7E C3 = movq %xmm0,%rbx —
# confirm), emitted as bytes for old-assembler compatibility.
160 pshufd $0x4e,%xmm0,%xmm1
163 .byte 102,72,15,126,195
# Inner multiply-and-reduce loop fragments: a[] via %rsi, n[] via %rcx,
# partial sums kept on the stack frame, indexed by %r15.
191 movq (%rsi,%r15,8),%rax
196 movq %r13,-16(%rsp,%r15,8)
202 movq (%rcx,%r15,8),%rax
216 movq %r13,-16(%rsp,%r9,8)
223 movq %r13,-8(%rsp,%r9,8)
224 movq %rdx,(%rsp,%r9,8)
# Second gather for the next b-word: same table rows at %r12, this time
# pand-ed against the mask slab addressed through %rdx.
230 leaq 24+128(%rsp,%r9,8),%rdx
234 movdqa -128(%r12),%xmm0
235 movdqa -112(%r12),%xmm1
236 movdqa -96(%r12),%xmm2
237 movdqa -80(%r12),%xmm3
238 pand -128(%rdx),%xmm0
239 pand -112(%rdx),%xmm1
246 movdqa -64(%r12),%xmm0
247 movdqa -48(%r12),%xmm1
248 movdqa -32(%r12),%xmm2
249 movdqa -16(%r12),%xmm3
259 movdqa 16(%r12),%xmm1
260 movdqa 32(%r12),%xmm2
261 movdqa 48(%r12),%xmm3
270 movdqa 64(%r12),%xmm0
271 movdqa 80(%r12),%xmm1
272 movdqa 96(%r12),%xmm2
273 movdqa 112(%r12),%xmm3
# Same half-combine + raw-encoded xmm->GPR move as above.
283 pshufd $0x4e,%xmm4,%xmm0
288 .byte 102,72,15,126,195
# Outer-loop body fragments: accumulate into the tp[] area on the stack.
315 movq (%rsi,%r15,8),%rax
318 movq (%rsp,%r15,8),%r10
320 movq %r13,-16(%rsp,%r15,8)
326 movq (%rcx,%r15,8),%rax
340 movq (%rsp,%r9,8),%r10
342 movq %r13,-16(%rsp,%r9,8)
350 movq %r13,-8(%rsp,%r9,8)
351 movq %rdx,(%rsp,%r9,8)
# Final conditional subtraction: subtract the modulus n[] (at %rcx) with
# borrow, writing the candidate result to rp[] (at %rdi).
363 .Lsub: sbbq (%rcx,%r14,8),%rax
364 movq %rax,(%rdi,%r14,8)
365 movq 8(%rsi,%r14,8),%rax
# Copy-out loop: write the selected result to %rdi and scrub the stack
# temporaries by overwriting them (with %r14 here).
380 movq (%rsi,%r14,8),%rax
381 movq %r14,(%rsp,%r14,8)
382 movq %rax,(%rdi,%r14,8)
# Epilogue: restore caller register(s) saved above the frame.
387 movq 8(%rsp,%r9,8),%rsi
399 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
#-----------------------------------------------------------------------
# bn_mul4x_mont_gather5 (sparse excerpt)
# 4x-unrolled variant of bn_mul_mont_gather5; only the frame setup is
# visible here.  The body proper appears to live in mul4x_internal.
#-----------------------------------------------------------------------
400 .type bn_mul4x_mont_gather5,@function
402 bn_mul4x_mont_gather5:
# %r10 = 3*num (num in %r9) — frame sizing arithmetic.
415 leaq (%r9,%r9,2),%r10
416 leaq -320(%rsp,%r9,2),%r11
# Two alternative frame placements; the 4096-based form presumably
# handles the page-crossing case (stack-probe style) — confirm against
# the branch logic not visible in this excerpt.
427 leaq -320(%rsp,%r9,2),%r11
#-----------------------------------------------------------------------
# mul4x_internal (sparse excerpt)
# Internal 4x-unrolled Montgomery multiply body shared by the gather5
# entry points.  Builds the same constant-time selection-mask table and
# gathers b-words from the scattered table at %r12, as in
# bn_mul_mont_gather5 above.
#-----------------------------------------------------------------------
467 .type mul4x_internal,@function
# .Linc is a small RIP-addressed increment/constant table.
472 leaq .Linc(%rip),%rax
473 leaq 128(%rdx,%r9,1),%r13
476 movdqa 16(%rax),%xmm1
477 leaq 88-112(%rsp,%r9,1),%r10
# Broadcast the (presumed) index lanes before mask generation.
480 pshufd $0,%xmm5,%xmm5
# Sixteen mask slots at 112..352(%r10), same layout as in
# bn_mul_mont_gather5 (mask-generation lines not visible here).
490 movdqa %xmm0,112(%r10)
495 movdqa %xmm1,128(%r10)
500 movdqa %xmm2,144(%r10)
505 movdqa %xmm3,160(%r10)
509 movdqa %xmm0,176(%r10)
514 movdqa %xmm1,192(%r10)
519 movdqa %xmm2,208(%r10)
524 movdqa %xmm3,224(%r10)
528 movdqa %xmm0,240(%r10)
533 movdqa %xmm1,256(%r10)
538 movdqa %xmm2,272(%r10)
543 movdqa %xmm3,288(%r10)
547 movdqa %xmm0,304(%r10)
552 movdqa %xmm1,320(%r10)
555 movdqa %xmm2,336(%r10)
560 movdqa %xmm3,352(%r10)
# Constant-time gather of the first b-word from the table at %r12.
564 movdqa -128(%r12),%xmm4
565 movdqa -112(%r12),%xmm5
566 movdqa -96(%r12),%xmm2
568 movdqa -80(%r12),%xmm3
576 movdqa -64(%r12),%xmm4
577 movdqa -48(%r12),%xmm5
578 movdqa -32(%r12),%xmm2
580 movdqa -16(%r12),%xmm3
589 movdqa 16(%r12),%xmm5
590 movdqa 32(%r12),%xmm2
592 movdqa 48(%r12),%xmm3
# Combine halves, then raw-encoded xmm->GPR move (presumably
# movq %xmm0,%rbx — confirm).
601 pshufd $0x4e,%xmm0,%xmm1
604 .byte 102,72,15,126,195
# Point %rsi past the end of a[] so negative indices (%r9/%r15) walk it.
611 leaq (%rsi,%r9,1),%rsi
625 movq 8(%rsi,%r9,1),%rax
637 movq 16(%rsi,%r9,1),%rax
# 4x-unrolled inner loop: four a[] words per iteration via %r15.
658 movq -8(%rsi,%r15,1),%rax
673 movq (%rsi,%r15,1),%rax
688 movq 8(%rsi,%r15,1),%rax
703 movq 16(%rsi,%r15,1),%rax
738 movq (%rsi,%r9,1),%rax
# Same end-relative trick for the modulus n[] at %rcx.
745 leaq (%rcx,%r9,1),%rcx
# Gather for the next b-word: masks addressed through %rdx, rows at %r12.
756 leaq 16+128(%r14),%rdx
759 movdqa -128(%r12),%xmm0
760 movdqa -112(%r12),%xmm1
761 movdqa -96(%r12),%xmm2
762 movdqa -80(%r12),%xmm3
763 pand -128(%rdx),%xmm0
764 pand -112(%rdx),%xmm1
771 movdqa -64(%r12),%xmm0
772 movdqa -48(%r12),%xmm1
773 movdqa -32(%r12),%xmm2
774 movdqa -16(%r12),%xmm3
784 movdqa 16(%r12),%xmm1
785 movdqa 32(%r12),%xmm2
786 movdqa 48(%r12),%xmm3
795 movdqa 64(%r12),%xmm0
796 movdqa 80(%r12),%xmm1
797 movdqa 96(%r12),%xmm2
798 movdqa 112(%r12),%xmm3
808 pshufd $0x4e,%xmm4,%xmm0
811 .byte 102,72,15,126,195
# Outer loop: tp[] partial results tracked through %r14.
813 movq (%r14,%r9,1),%r10
824 leaq (%r14,%r9,1),%r14
828 movq 8(%rsi,%r9,1),%rax
842 movq 16(%rsi,%r9,1),%rax
864 movq -8(%rsi,%r15,1),%rax
881 movq (%rsi,%r15,1),%rax
898 movq 8(%rsi,%r15,1),%rax
915 movq 16(%rsi,%r15,1),%rax
955 movq (%rsi,%r9,1),%rax
963 leaq (%rcx,%r9,1),%rcx
# Tail: jump into the shared subtract/copy-out path of the sqr code.
979 leaq (%r14,%r9,1),%rbx
990 jmp .Lsqr4x_sub_entry
991 .size mul4x_internal,.-mul4x_internal
#-----------------------------------------------------------------------
# bn_power5 (sparse excerpt)
# Raises the input to a power via five back-to-back modular squarings
# (x -> x^(2^5) = x^32, presumably, given the name and the five
# squaring/post pairs below), followed by a gathered multiply that is
# not visible in this excerpt.
#-----------------------------------------------------------------------
993 .type bn_power5,@function
# %r10d = 3*num — frame sizing, as in bn_mul4x_mont_gather5.
1005 leal (%r9,%r9,2),%r10d
1016 leaq -320(%rsp,%r9,2),%r11
1022 leaq -320(%rsp,%r9,2),%rsp
# Alternate (page-crossing) frame placement — confirm branch not shown.
1027 leaq 4096-320(,%r9,2),%r10
1028 leaq -320(%rsp,%r9,2),%rsp
# Raw-encoded movq GPR->xmm spills (presumably rdi/rcx/rdx-family args
# parked in xmm registers so they survive the internal calls — confirm).
1050 .byte 102,72,15,110,207
1051 .byte 102,72,15,110,209
1052 .byte 102,73,15,110,218
1053 .byte 102,72,15,110,226
# Five squaring rounds; each __bn_sqr8x_internal is followed by the
# post-processing/conditional-subtract pass.
1055 call __bn_sqr8x_internal
1056 call __bn_post4x_internal
1057 call __bn_sqr8x_internal
1058 call __bn_post4x_internal
1059 call __bn_sqr8x_internal
1060 call __bn_post4x_internal
1061 call __bn_sqr8x_internal
1062 call __bn_post4x_internal
1063 call __bn_sqr8x_internal
1064 call __bn_post4x_internal
# Raw-encoded movq xmm->GPR reloads of the parked values.
1066 .byte 102,72,15,126,209
1067 .byte 102,72,15,126,226
1085 .size bn_power5,.-bn_power5
#-----------------------------------------------------------------------
# bn_sqr8x_internal / __bn_sqr8x_reduction (sparse excerpt)
# 8x squaring kernel: computes the cross-product half, doubles it via
# the shift-n-add pass, then performs Montgomery reduction in the
# __bn_sqr8x_reduction section.  __bn_sqr8x_reduction is also called
# directly (see bn_from_mont8x below).
#-----------------------------------------------------------------------
1087 .globl bn_sqr8x_internal
1088 .hidden bn_sqr8x_internal
1089 .type bn_sqr8x_internal,@function
1092 __bn_sqr8x_internal:
# End-relative addressing: %rsi points past a[], negative %rbp/%rcx walk it.
1167 leaq (%rsi,%r9,1),%rsi
1172 movq -32(%rsi,%rbp,1),%r14
# tp[] result area at 48+8(%rsp) sized 2*num.
1173 leaq 48+8(%rsp,%r9,2),%rdi
1174 movq -24(%rsi,%rbp,1),%rax
1175 leaq -32(%rdi,%rbp,1),%rdi
1176 movq -16(%rsi,%rbp,1),%rbx
1183 movq %r10,-24(%rdi,%rbp,1)
1189 movq %r11,-16(%rdi,%rbp,1)
1193 movq -8(%rsi,%rbp,1),%rbx
# Inner cross-product loop: a[i]*a[j] accumulated into tp[], %rcx index.
1207 movq %r10,-8(%rdi,%rcx,1)
1212 movq (%rsi,%rcx,1),%rbx
1222 movq 8(%rsi,%rcx,1),%rbx
1232 movq %r11,(%rdi,%rcx,1)
1239 movq 16(%rsi,%rcx,1),%rbx
1248 movq %r10,8(%rdi,%rcx,1)
1255 movq 24(%rsi,%rcx,1),%rbx
1265 movq %r11,16(%rdi,%rcx,1)
1277 movq %r10,-8(%rdi,%rcx,1)
# Outer-loop re-entry: same tp[] setup for the next row of products.
1296 movq -32(%rsi,%rbp,1),%r14
1297 leaq 48+8(%rsp,%r9,2),%rdi
1298 movq -24(%rsi,%rbp,1),%rax
1299 leaq -32(%rdi,%rbp,1),%rdi
1300 movq -16(%rsi,%rbp,1),%rbx
1304 movq -24(%rdi,%rbp,1),%r10
1308 movq %r10,-24(%rdi,%rbp,1)
# This pass adds into existing tp[] words (read-modify-write).
1315 addq -16(%rdi,%rbp,1),%r11
1318 movq %r11,-16(%rdi,%rbp,1)
1322 movq -8(%rsi,%rbp,1),%rbx
1327 addq -8(%rdi,%rbp,1),%r12
1338 movq %r10,-8(%rdi,%rbp,1)
1345 movq (%rsi,%rcx,1),%rbx
1351 addq (%rdi,%rcx,1),%r13
1358 movq 8(%rsi,%rcx,1),%rbx
1366 movq %r11,(%rdi,%rcx,1)
1370 addq 8(%rdi,%rcx,1),%r12
1381 movq %r10,-8(%rdi,%rcx,1)
1402 leaq 48+8(%rsp,%r9,2),%rdi
1404 leaq -32(%rdi,%rbp,1),%rdi
# Shift-n-add pass: double the cross products and fold in the squares
# a[i]^2.  The lea scale-2 forms (reg + reg*2) perform the doubling
# while presumably folding a carry held in %r14/%rcx — confirm against
# the flag-producing lines not visible in this excerpt.
1463 movq -16(%rsi,%rbp,1),%rax
1464 leaq 48+8(%rsp),%rdi
1468 leaq (%r14,%r10,2),%r12
1470 leaq (%rcx,%r11,2),%r13
1479 movq -8(%rsi,%rbp,1),%rax
1483 leaq (%r14,%r10,2),%rbx
1487 leaq (%rcx,%r11,2),%r8
1496 movq 0(%rsi,%rbp,1),%rax
1503 jmp .Lsqr4x_shift_n_add
1506 .Lsqr4x_shift_n_add:
1507 leaq (%r14,%r10,2),%r12
1509 leaq (%rcx,%r11,2),%r13
1518 movq -8(%rsi,%rbp,1),%rax
1522 leaq (%r14,%r10,2),%rbx
1526 leaq (%rcx,%r11,2),%r8
1535 movq 0(%rsi,%rbp,1),%rax
1539 leaq (%r14,%r10,2),%r12
1543 leaq (%rcx,%r11,2),%r13
1552 movq 8(%rsi,%rbp,1),%rax
1556 leaq (%r14,%r10,2),%rbx
1560 leaq (%rcx,%r11,2),%r8
1569 movq 16(%rsi,%rbp,1),%rax
1576 jnz .Lsqr4x_shift_n_add
# Loop tail: last doubled words outside the loop body.
1578 leaq (%r14,%r10,2),%r12
1581 leaq (%rcx,%r11,2),%r13
1594 leaq (%r14,%r10,2),%rbx
1598 leaq (%rcx,%r11,2),%r8
# Raw-encoded xmm->GPR move, then the reduction entry point.
1607 .byte 102,72,15,126,213
1608 __bn_sqr8x_reduction:
1610 leaq (%r9,%rbp,1),%rcx
1611 leaq 48+8(%rsp,%r9,2),%rdx
1613 leaq 48+8(%rsp,%r9,1),%rdi
1616 jmp .L8x_reduction_loop
# Montgomery reduction, 8 words per pass; 32+8(%rsp) holds n0 (the
# precomputed -n^-1 mod 2^64, presumably — confirm against prologue).
1619 .L8x_reduction_loop:
1620 leaq (%rdi,%r9,1),%rdi
1635 imulq 32+8(%rsp),%rbx
1653 movq %rbx,48-8+8(%rsp,%rcx,8)
1662 movq 32+8(%rsp),%rsi
1728 movq 48+56+8(%rsp),%rbx
1792 movq 48-16+8(%rsp,%rcx,8),%rbx
1808 movq 48+56+8(%rsp),%rbx
# Raw-encoded xmm->GPR reloads of values parked across the loop.
1852 .byte 102,72,15,126,213
1856 .byte 102,73,15,126,217
1866 jb .L8x_reduction_loop
1868 .size bn_sqr8x_internal,.-bn_sqr8x_internal
#-----------------------------------------------------------------------
# __bn_post4x_internal (sparse excerpt)
# Post-pass after squaring/reduction: recovers pointers parked in xmm
# registers and jumps into the shared conditional-subtract/copy-out
# path (.Lsqr4x_sub_entry), also used by mul4x_internal.
#-----------------------------------------------------------------------
1869 .type __bn_post4x_internal,@function
1871 __bn_post4x_internal:
1873 leaq (%rdi,%r9,1),%rbx
# Raw-encoded movq xmm->GPR reloads (presumably rdi/rsi pointers parked
# earlier — confirm encodings 66 48 0F 7E CF / CE).
1875 .byte 102,72,15,126,207
1877 .byte 102,72,15,126,206
1884 jmp .Lsqr4x_sub_entry
1922 .size __bn_post4x_internal,.-__bn_post4x_internal
#-----------------------------------------------------------------------
# bn_from_montgomery (excerpt — body lines not visible in this view)
# Public entry for converting out of Montgomery form; presumably a thin
# dispatcher to bn_from_mont8x below — confirm against the full source.
#-----------------------------------------------------------------------
1923 .globl bn_from_montgomery
1924 .type bn_from_montgomery,@function
1931 .size bn_from_montgomery,.-bn_from_montgomery
#-----------------------------------------------------------------------
# bn_from_mont8x (sparse excerpt)
# Converts a value out of Montgomery form using the shared 8x reduction
# (__bn_sqr8x_reduction) and post-pass, then zeroes its stack frame.
#-----------------------------------------------------------------------
1933 .type bn_from_mont8x,@function
# Frame sizing: %r10 = 3*num, frame carved 320 bytes under 2*num words.
1946 leaq (%r9,%r9,2),%r10
1957 leaq -320(%rsp,%r9,2),%r11
1963 leaq -320(%rsp,%r9,2),%rsp
# Alternate (page-crossing) placement — confirm branch not shown here.
1968 leaq 4096-320(,%r9,2),%r10
1969 leaq -320(%rsp,%r9,2),%rsp
# Copy the input (unaligned loads from %rsi) into the low half of the
# work area at %rax while storing %xmm0 into the upper half — %xmm0 is
# presumably zero here, clearing the top num bytes (confirm: its
# initialization is not visible in this excerpt).
1999 movdqu 16(%rsi),%xmm2
2000 movdqu 32(%rsi),%xmm3
2001 movdqa %xmm0,(%rax,%r9,1)
2002 movdqu 48(%rsi),%xmm4
2003 movdqa %xmm0,16(%rax,%r9,1)
# Raw-encoded lea 0x40(%rsi),%rsi (presumably — confirm bytes).
2004 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2006 movdqa %xmm0,32(%rax,%r9,1)
2007 movdqa %xmm2,16(%rax)
2008 movdqa %xmm0,48(%rax,%r9,1)
2009 movdqa %xmm3,32(%rax)
2010 movdqa %xmm4,48(%rax)
# Park pointers in xmm regs across the calls (raw-encoded movq GPR->xmm).
2015 .byte 102,72,15,110,207
2016 .byte 102,72,15,110,209
2019 .byte 102,73,15,110,218
# One reduction pass converts out of Montgomery form (divides by R).
2020 call __bn_sqr8x_reduction
2021 call __bn_post4x_internal
2026 jmp .Lfrom_mont_zero
# Scrub the frame: overwrite 64 bytes per iteration with %xmm0 until done.
2030 movdqa %xmm0,0(%rax)
2031 movdqa %xmm0,16(%rax)
2032 movdqa %xmm0,32(%rax)
2033 movdqa %xmm0,48(%rax)
2036 jnz .Lfrom_mont_zero
2048 .size bn_from_mont8x,.-bn_from_mont8x
#-----------------------------------------------------------------------
# bn_get_bits5 (sparse excerpt)
# Extracts a 5-bit window from a bignum: loads the 16-bit word covering
# the bit position (index scaled by 2); the shift/mask that isolates the
# 5 bits is not visible in this excerpt.
#-----------------------------------------------------------------------
2050 .type bn_get_bits5,@function
2062 movzwl (%r10,%rsi,2),%eax
2066 .size bn_get_bits5,.-bn_get_bits5
#-----------------------------------------------------------------------
# bn_scatter5 (sparse excerpt)
# Stores a bignum into the strided table later read by bn_gather5.
# Guard: skip the store loop entirely when the length is zero.
#-----------------------------------------------------------------------
2069 .type bn_scatter5,@function
2073 jz .Lscatter_epilogue
# %rdx += index*8: position within the scattered table.
2074 leaq (%rdx,%rcx,8),%rdx
2084 .size bn_scatter5,.-bn_scatter5
#-----------------------------------------------------------------------
# bn_gather5 (sparse excerpt)
# Constant-time table lookup: selects one of 2^5 scattered bignum rows
# by building all-rows selection masks on the stack and AND-ing every
# row against its mask, so the secret index never forms an address.
# The .LSEH_* labels bound the region for Win64 structured exception
# handling metadata.
#-----------------------------------------------------------------------
2087 .type bn_gather5,@function
2090 .LSEH_begin_bn_gather5:
# Raw-encoded prologue (fixed bytes so the SEH unwind data matches):
# presumably lea (%rsp),%r10 then sub $0x108,%rsp — confirm encodings.
2092 .byte 0x4c,0x8d,0x14,0x24
2093 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
2094 leaq .Linc(%rip),%rax
2098 movdqa 0(%rax),%xmm0
2099 movdqa 16(%rax),%xmm1
# Broadcast the (presumed) index before mask generation.
2103 pshufd $0,%xmm5,%xmm5
# Sixteen 16-byte mask slots at -128..112(%rax).
2112 movdqa %xmm0,-128(%rax)
2117 movdqa %xmm1,-112(%rax)
2122 movdqa %xmm2,-96(%rax)
2126 movdqa %xmm3,-80(%rax)
2131 movdqa %xmm0,-64(%rax)
2136 movdqa %xmm1,-48(%rax)
2141 movdqa %xmm2,-32(%rax)
2145 movdqa %xmm3,-16(%rax)
2150 movdqa %xmm0,0(%rax)
2155 movdqa %xmm1,16(%rax)
2160 movdqa %xmm2,32(%rax)
2164 movdqa %xmm3,48(%rax)
2169 movdqa %xmm0,64(%rax)
2174 movdqa %xmm1,80(%rax)
2179 movdqa %xmm2,96(%rax)
2181 movdqa %xmm3,112(%rax)
# Gather proper: table rows at %r11, masks at %rax; pand keeps only the
# selected row, all rows are touched regardless of the index.
2188 movdqa -128(%r11),%xmm0
2189 movdqa -112(%r11),%xmm1
2190 movdqa -96(%r11),%xmm2
2191 pand -128(%rax),%xmm0
2192 movdqa -80(%r11),%xmm3
2193 pand -112(%rax),%xmm1
2195 pand -96(%rax),%xmm2
2197 pand -80(%rax),%xmm3
2200 movdqa -64(%r11),%xmm0
2201 movdqa -48(%r11),%xmm1
2202 movdqa -32(%r11),%xmm2
2203 pand -64(%rax),%xmm0
2204 movdqa -16(%r11),%xmm3
2205 pand -48(%rax),%xmm1
2207 pand -32(%rax),%xmm2
2209 pand -16(%rax),%xmm3
2212 movdqa 0(%r11),%xmm0
2213 movdqa 16(%r11),%xmm1
2214 movdqa 32(%r11),%xmm2
2216 movdqa 48(%r11),%xmm3
2224 movdqa 64(%r11),%xmm0
2225 movdqa 80(%r11),%xmm1
2226 movdqa 96(%r11),%xmm2
2228 movdqa 112(%r11),%xmm3
2233 pand 112(%rax),%xmm3
# Fold the accumulated selection down to the result word(s).
2238 pshufd $0x4e,%xmm4,%xmm0
2247 .LSEH_end_bn_gather5:
2248 .size bn_gather5,.-bn_gather5
2253 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0