@ sys/crypto/openssl/arm/poly1305-armv4.S
@ Generated OpenSSL Poly1305 assembly (ARMv4+), kernel copy in the
@ FreeBSD sources ("Move generated OpenSSL assembly routines into the
@ kernel sources.").
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
3 #include "arm_arch.h"
4
5 .text
6 #if defined(__thumb2__)
7 .syntax unified
8 .thumb
9 #else
10 .code   32
11 #endif
12
13 .globl  poly1305_emit
14 .globl  poly1305_blocks
15 .globl  poly1305_init
16 .type   poly1305_init,%function
17 .align  5
18 poly1305_init:
19 .Lpoly1305_init:
20         stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
21
22         eor     r3,r3,r3
23         cmp     r1,#0
24         str     r3,[r0,#0]              @ zero hash value
25         str     r3,[r0,#4]
26         str     r3,[r0,#8]
27         str     r3,[r0,#12]
28         str     r3,[r0,#16]
29         str     r3,[r0,#36]             @ is_base2_26
30         add     r0,r0,#20
31
32 #ifdef  __thumb2__
33         it      eq
34 #endif
35         moveq   r0,#0
36         beq     .Lno_key
37
38 #if     __ARM_MAX_ARCH__>=7
39         adr     r11,.Lpoly1305_init
40         ldr     r12,.LOPENSSL_armcap
41 #endif
42         ldrb    r4,[r1,#0]
43         mov     r10,#0x0fffffff
44         ldrb    r5,[r1,#1]
45         and     r3,r10,#-4              @ 0x0ffffffc
46         ldrb    r6,[r1,#2]
47         ldrb    r7,[r1,#3]
48         orr     r4,r4,r5,lsl#8
49         ldrb    r5,[r1,#4]
50         orr     r4,r4,r6,lsl#16
51         ldrb    r6,[r1,#5]
52         orr     r4,r4,r7,lsl#24
53         ldrb    r7,[r1,#6]
54         and     r4,r4,r10
55
56 #if     __ARM_MAX_ARCH__>=7
57         ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
58 # ifdef __APPLE__
59         ldr     r12,[r12]
60 # endif
61 #endif
62         ldrb    r8,[r1,#7]
63         orr     r5,r5,r6,lsl#8
64         ldrb    r6,[r1,#8]
65         orr     r5,r5,r7,lsl#16
66         ldrb    r7,[r1,#9]
67         orr     r5,r5,r8,lsl#24
68         ldrb    r8,[r1,#10]
69         and     r5,r5,r3
70
71 #if     __ARM_MAX_ARCH__>=7
72         tst     r12,#ARMV7_NEON         @ check for NEON
73 # ifdef __APPLE__
74         adr     r9,poly1305_blocks_neon
75         adr     r11,poly1305_blocks
76 #  ifdef __thumb2__
77         it      ne
78 #  endif
79         movne   r11,r9
80         adr     r12,poly1305_emit
81         adr     r10,poly1305_emit_neon
82 #  ifdef __thumb2__
83         it      ne
84 #  endif
85         movne   r12,r10
86 # else
87 #  ifdef __thumb2__
88         itete   eq
89 #  endif
90         addeq   r12,r11,#(poly1305_emit-.Lpoly1305_init)
91         addne   r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
92         addeq   r11,r11,#(poly1305_blocks-.Lpoly1305_init)
93         addne   r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
94 # endif
95 # ifdef __thumb2__
96         orr     r12,r12,#1      @ thumb-ify address
97         orr     r11,r11,#1
98 # endif
99 #endif
100         ldrb    r9,[r1,#11]
101         orr     r6,r6,r7,lsl#8
102         ldrb    r7,[r1,#12]
103         orr     r6,r6,r8,lsl#16
104         ldrb    r8,[r1,#13]
105         orr     r6,r6,r9,lsl#24
106         ldrb    r9,[r1,#14]
107         and     r6,r6,r3
108
109         ldrb    r10,[r1,#15]
110         orr     r7,r7,r8,lsl#8
111         str     r4,[r0,#0]
112         orr     r7,r7,r9,lsl#16
113         str     r5,[r0,#4]
114         orr     r7,r7,r10,lsl#24
115         str     r6,[r0,#8]
116         and     r7,r7,r3
117         str     r7,[r0,#12]
118 #if     __ARM_MAX_ARCH__>=7
119         stmia   r2,{r11,r12}            @ fill functions table
120         mov     r0,#1
121 #else
122         mov     r0,#0
123 #endif
124 .Lno_key:
125         ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
126 #if     __ARM_ARCH__>=5
127         bx      lr                              @ bx    lr
128 #else
129         tst     lr,#1
130         moveq   pc,lr                   @ be binary compatible with V4, yet
131 .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
132 #endif
133 .size   poly1305_init,.-poly1305_init
134 .type   poly1305_blocks,%function
135 .align  5
136 poly1305_blocks:
137 .Lpoly1305_blocks:
138         stmdb   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
139
140         ands    r2,r2,#-16
141         beq     .Lno_data
142
143         cmp     r3,#0
144         add     r2,r2,r1                @ end pointer
145         sub     sp,sp,#32
146
147         ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}              @ load context
148
149         str     r0,[sp,#12]             @ offload stuff
150         mov     lr,r1
151         str     r2,[sp,#16]
152         str     r10,[sp,#20]
153         str     r11,[sp,#24]
154         str     r12,[sp,#28]
155         b       .Loop
156
157 .Loop:
158 #if __ARM_ARCH__<7
159         ldrb    r0,[lr],#16             @ load input
160 # ifdef __thumb2__
161         it      hi
162 # endif
163         addhi   r8,r8,#1                @ 1<<128
164         ldrb    r1,[lr,#-15]
165         ldrb    r2,[lr,#-14]
166         ldrb    r3,[lr,#-13]
167         orr     r1,r0,r1,lsl#8
168         ldrb    r0,[lr,#-12]
169         orr     r2,r1,r2,lsl#16
170         ldrb    r1,[lr,#-11]
171         orr     r3,r2,r3,lsl#24
172         ldrb    r2,[lr,#-10]
173         adds    r4,r4,r3                @ accumulate input
174
175         ldrb    r3,[lr,#-9]
176         orr     r1,r0,r1,lsl#8
177         ldrb    r0,[lr,#-8]
178         orr     r2,r1,r2,lsl#16
179         ldrb    r1,[lr,#-7]
180         orr     r3,r2,r3,lsl#24
181         ldrb    r2,[lr,#-6]
182         adcs    r5,r5,r3
183
184         ldrb    r3,[lr,#-5]
185         orr     r1,r0,r1,lsl#8
186         ldrb    r0,[lr,#-4]
187         orr     r2,r1,r2,lsl#16
188         ldrb    r1,[lr,#-3]
189         orr     r3,r2,r3,lsl#24
190         ldrb    r2,[lr,#-2]
191         adcs    r6,r6,r3
192
193         ldrb    r3,[lr,#-1]
194         orr     r1,r0,r1,lsl#8
195         str     lr,[sp,#8]              @ offload input pointer
196         orr     r2,r1,r2,lsl#16
197         add     r10,r10,r10,lsr#2
198         orr     r3,r2,r3,lsl#24
199 #else
200         ldr     r0,[lr],#16             @ load input
201 # ifdef __thumb2__
202         it      hi
203 # endif
204         addhi   r8,r8,#1                @ padbit
205         ldr     r1,[lr,#-12]
206         ldr     r2,[lr,#-8]
207         ldr     r3,[lr,#-4]
208 # ifdef __ARMEB__
209         rev     r0,r0
210         rev     r1,r1
211         rev     r2,r2
212         rev     r3,r3
213 # endif
214         adds    r4,r4,r0                @ accumulate input
215         str     lr,[sp,#8]              @ offload input pointer
216         adcs    r5,r5,r1
217         add     r10,r10,r10,lsr#2
218         adcs    r6,r6,r2
219 #endif
220         add     r11,r11,r11,lsr#2
221         adcs    r7,r7,r3
222         add     r12,r12,r12,lsr#2
223
224         umull   r2,r3,r5,r9
225         adc     r8,r8,#0
226         umull   r0,r1,r4,r9
227         umlal   r2,r3,r8,r10
228         umlal   r0,r1,r7,r10
229         ldr     r10,[sp,#20]            @ reload r10
230         umlal   r2,r3,r6,r12
231         umlal   r0,r1,r5,r12
232         umlal   r2,r3,r7,r11
233         umlal   r0,r1,r6,r11
234         umlal   r2,r3,r4,r10
235         str     r0,[sp,#0]              @ future r4
236         mul     r0,r11,r8
237         ldr     r11,[sp,#24]            @ reload r11
238         adds    r2,r2,r1                @ d1+=d0>>32
239         eor     r1,r1,r1
240         adc     lr,r3,#0                @ future r6
241         str     r2,[sp,#4]              @ future r5
242
243         mul     r2,r12,r8
244         eor     r3,r3,r3
245         umlal   r0,r1,r7,r12
246         ldr     r12,[sp,#28]            @ reload r12
247         umlal   r2,r3,r7,r9
248         umlal   r0,r1,r6,r9
249         umlal   r2,r3,r6,r10
250         umlal   r0,r1,r5,r10
251         umlal   r2,r3,r5,r11
252         umlal   r0,r1,r4,r11
253         umlal   r2,r3,r4,r12
254         ldr     r4,[sp,#0]
255         mul     r8,r9,r8
256         ldr     r5,[sp,#4]
257
258         adds    r6,lr,r0                @ d2+=d1>>32
259         ldr     lr,[sp,#8]              @ reload input pointer
260         adc     r1,r1,#0
261         adds    r7,r2,r1                @ d3+=d2>>32
262         ldr     r0,[sp,#16]             @ reload end pointer
263         adc     r3,r3,#0
264         add     r8,r8,r3                @ h4+=d3>>32
265
266         and     r1,r8,#-4
267         and     r8,r8,#3
268         add     r1,r1,r1,lsr#2          @ *=5
269         adds    r4,r4,r1
270         adcs    r5,r5,#0
271         adcs    r6,r6,#0
272         adcs    r7,r7,#0
273         adc     r8,r8,#0
274
275         cmp     r0,lr                   @ done yet?
276         bhi     .Loop
277
278         ldr     r0,[sp,#12]
279         add     sp,sp,#32
280         stmia   r0,{r4,r5,r6,r7,r8}             @ store the result
281
282 .Lno_data:
283 #if     __ARM_ARCH__>=5
284         ldmia   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
285 #else
286         ldmia   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
287         tst     lr,#1
288         moveq   pc,lr                   @ be binary compatible with V4, yet
289 .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
290 #endif
291 .size   poly1305_blocks,.-poly1305_blocks
292 .type   poly1305_emit,%function
293 .align  5
294 poly1305_emit:
295         stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
296 .Lpoly1305_emit_enter:
297
298         ldmia   r0,{r3,r4,r5,r6,r7}
299         adds    r8,r3,#5                @ compare to modulus
300         adcs    r9,r4,#0
301         adcs    r10,r5,#0
302         adcs    r11,r6,#0
303         adc     r7,r7,#0
304         tst     r7,#4                   @ did it carry/borrow?
305
306 #ifdef  __thumb2__
307         it      ne
308 #endif
309         movne   r3,r8
310         ldr     r8,[r2,#0]
311 #ifdef  __thumb2__
312         it      ne
313 #endif
314         movne   r4,r9
315         ldr     r9,[r2,#4]
316 #ifdef  __thumb2__
317         it      ne
318 #endif
319         movne   r5,r10
320         ldr     r10,[r2,#8]
321 #ifdef  __thumb2__
322         it      ne
323 #endif
324         movne   r6,r11
325         ldr     r11,[r2,#12]
326
327         adds    r3,r3,r8
328         adcs    r4,r4,r9
329         adcs    r5,r5,r10
330         adc     r6,r6,r11
331
332 #if __ARM_ARCH__>=7
333 # ifdef __ARMEB__
334         rev     r3,r3
335         rev     r4,r4
336         rev     r5,r5
337         rev     r6,r6
338 # endif
339         str     r3,[r1,#0]
340         str     r4,[r1,#4]
341         str     r5,[r1,#8]
342         str     r6,[r1,#12]
343 #else
344         strb    r3,[r1,#0]
345         mov     r3,r3,lsr#8
346         strb    r4,[r1,#4]
347         mov     r4,r4,lsr#8
348         strb    r5,[r1,#8]
349         mov     r5,r5,lsr#8
350         strb    r6,[r1,#12]
351         mov     r6,r6,lsr#8
352
353         strb    r3,[r1,#1]
354         mov     r3,r3,lsr#8
355         strb    r4,[r1,#5]
356         mov     r4,r4,lsr#8
357         strb    r5,[r1,#9]
358         mov     r5,r5,lsr#8
359         strb    r6,[r1,#13]
360         mov     r6,r6,lsr#8
361
362         strb    r3,[r1,#2]
363         mov     r3,r3,lsr#8
364         strb    r4,[r1,#6]
365         mov     r4,r4,lsr#8
366         strb    r5,[r1,#10]
367         mov     r5,r5,lsr#8
368         strb    r6,[r1,#14]
369         mov     r6,r6,lsr#8
370
371         strb    r3,[r1,#3]
372         strb    r4,[r1,#7]
373         strb    r5,[r1,#11]
374         strb    r6,[r1,#15]
375 #endif
376         ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
377 #if     __ARM_ARCH__>=5
378         bx      lr                              @ bx    lr
379 #else
380         tst     lr,#1
381         moveq   pc,lr                   @ be binary compatible with V4, yet
382 .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
383 #endif
384 .size   poly1305_emit,.-poly1305_emit
385 #if     __ARM_MAX_ARCH__>=7
386 .fpu    neon
387
388 .type   poly1305_init_neon,%function
389 .align  5
390 poly1305_init_neon:
391         ldr     r4,[r0,#20]             @ load key base 2^32
392         ldr     r5,[r0,#24]
393         ldr     r6,[r0,#28]
394         ldr     r7,[r0,#32]
395
396         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
397         mov     r3,r4,lsr#26
398         mov     r4,r5,lsr#20
399         orr     r3,r3,r5,lsl#6
400         mov     r5,r6,lsr#14
401         orr     r4,r4,r6,lsl#12
402         mov     r6,r7,lsr#8
403         orr     r5,r5,r7,lsl#18
404         and     r3,r3,#0x03ffffff
405         and     r4,r4,#0x03ffffff
406         and     r5,r5,#0x03ffffff
407
408         vdup.32 d0,r2                   @ r^1 in both lanes
409         add     r2,r3,r3,lsl#2          @ *5
410         vdup.32 d1,r3
411         add     r3,r4,r4,lsl#2
412         vdup.32 d2,r2
413         vdup.32 d3,r4
414         add     r4,r5,r5,lsl#2
415         vdup.32 d4,r3
416         vdup.32 d5,r5
417         add     r5,r6,r6,lsl#2
418         vdup.32 d6,r4
419         vdup.32 d7,r6
420         vdup.32 d8,r5
421
422         mov     r5,#2           @ counter
423
424 .Lsquare_neon:
425         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
426         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
427         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
428         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
429         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
430         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
431
432         vmull.u32       q5,d0,d0[1]
433         vmull.u32       q6,d1,d0[1]
434         vmull.u32       q7,d3,d0[1]
435         vmull.u32       q8,d5,d0[1]
436         vmull.u32       q9,d7,d0[1]
437
438         vmlal.u32       q5,d7,d2[1]
439         vmlal.u32       q6,d0,d1[1]
440         vmlal.u32       q7,d1,d1[1]
441         vmlal.u32       q8,d3,d1[1]
442         vmlal.u32       q9,d5,d1[1]
443
444         vmlal.u32       q5,d5,d4[1]
445         vmlal.u32       q6,d7,d4[1]
446         vmlal.u32       q8,d1,d3[1]
447         vmlal.u32       q7,d0,d3[1]
448         vmlal.u32       q9,d3,d3[1]
449
450         vmlal.u32       q5,d3,d6[1]
451         vmlal.u32       q8,d0,d5[1]
452         vmlal.u32       q6,d5,d6[1]
453         vmlal.u32       q7,d7,d6[1]
454         vmlal.u32       q9,d1,d5[1]
455
456         vmlal.u32       q8,d7,d8[1]
457         vmlal.u32       q5,d1,d8[1]
458         vmlal.u32       q6,d3,d8[1]
459         vmlal.u32       q7,d5,d8[1]
460         vmlal.u32       q9,d0,d7[1]
461
462         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
463         @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
464         @ and P. Schwabe
465         @
466         @ H0>>+H1>>+H2>>+H3>>+H4
467         @ H3>>+H4>>*5+H0>>+H1
468         @
469         @ Trivia.
470         @
471         @ Result of multiplication of n-bit number by m-bit number is
472         @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
473         @ m-bit number multiplied by 2^n is still n+m bits wide.
474         @
475         @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
476         @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
477         @ one is n+1 bits wide.
478         @
479         @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
480         @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
481         @ can be 27. However! In cases when their width exceeds 26 bits
482         @ they are limited by 2^26+2^6. This in turn means that *sum*
483         @ of the products with these values can still be viewed as sum
484         @ of 52-bit numbers as long as the amount of addends is not a
485         @ power of 2. For example,
486         @
487         @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
488         @
489         @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
490         @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
491         @ 8 * (2^52) or 2^55. However, the value is then multiplied by
492         @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
493         @ which is less than 32 * (2^52) or 2^57. And when processing
494         @ data we are looking at triple as many addends...
495         @
496         @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
497         @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
498         @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
499         @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
500         @ instruction accepts 2x32-bit input and writes 2x64-bit result.
501         @ This means that result of reduction have to be compressed upon
502         @ loop wrap-around. This can be done in the process of reduction
503         @ to minimize amount of instructions [as well as amount of
504         @ 128-bit instructions, which benefits low-end processors], but
505         @ one has to watch for H2 (which is narrower than H0) and 5*H4
506         @ not being wider than 58 bits, so that result of right shift
507         @ by 26 bits fits in 32 bits. This is also useful on x86,
508         @ because it allows to use paddd in place for paddq, which
509         @ benefits Atom, where paddq is ridiculously slow.
510
511         vshr.u64        q15,q8,#26
512         vmovn.i64       d16,q8
513         vshr.u64        q4,q5,#26
514         vmovn.i64       d10,q5
515         vadd.i64        q9,q9,q15               @ h3 -> h4
516         vbic.i32        d16,#0xfc000000 @ &=0x03ffffff
517         vadd.i64        q6,q6,q4                @ h0 -> h1
518         vbic.i32        d10,#0xfc000000
519
520         vshrn.u64       d30,q9,#26
521         vmovn.i64       d18,q9
522         vshr.u64        q4,q6,#26
523         vmovn.i64       d12,q6
524         vadd.i64        q7,q7,q4                @ h1 -> h2
525         vbic.i32        d18,#0xfc000000
526         vbic.i32        d12,#0xfc000000
527
528         vadd.i32        d10,d10,d30
529         vshl.u32        d30,d30,#2
530         vshrn.u64       d8,q7,#26
531         vmovn.i64       d14,q7
532         vadd.i32        d10,d10,d30     @ h4 -> h0
533         vadd.i32        d16,d16,d8      @ h2 -> h3
534         vbic.i32        d14,#0xfc000000
535
536         vshr.u32        d30,d10,#26
537         vbic.i32        d10,#0xfc000000
538         vshr.u32        d8,d16,#26
539         vbic.i32        d16,#0xfc000000
540         vadd.i32        d12,d12,d30     @ h0 -> h1
541         vadd.i32        d18,d18,d8      @ h3 -> h4
542
543         subs    r5,r5,#1
544         beq     .Lsquare_break_neon
545
546         add     r6,r0,#(48+0*9*4)
547         add     r7,r0,#(48+1*9*4)
548
549         vtrn.32 d0,d10          @ r^2:r^1
550         vtrn.32 d3,d14
551         vtrn.32 d5,d16
552         vtrn.32 d1,d12
553         vtrn.32 d7,d18
554
555         vshl.u32        d4,d3,#2                @ *5
556         vshl.u32        d6,d5,#2
557         vshl.u32        d2,d1,#2
558         vshl.u32        d8,d7,#2
559         vadd.i32        d4,d4,d3
560         vadd.i32        d2,d2,d1
561         vadd.i32        d6,d6,d5
562         vadd.i32        d8,d8,d7
563
564         vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
565         vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
566         vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
567         vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
568         vst1.32 {d8[0]},[r6,:32]
569         vst1.32 {d8[1]},[r7,:32]
570
571         b       .Lsquare_neon
572
573 .align  4
574 .Lsquare_break_neon:
575         add     r6,r0,#(48+2*4*9)
576         add     r7,r0,#(48+3*4*9)
577
578         vmov    d0,d10          @ r^4:r^3
579         vshl.u32        d2,d12,#2               @ *5
580         vmov    d1,d12
581         vshl.u32        d4,d14,#2
582         vmov    d3,d14
583         vshl.u32        d6,d16,#2
584         vmov    d5,d16
585         vshl.u32        d8,d18,#2
586         vmov    d7,d18
587         vadd.i32        d2,d2,d12
588         vadd.i32        d4,d4,d14
589         vadd.i32        d6,d6,d16
590         vadd.i32        d8,d8,d18
591
592         vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
593         vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
594         vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
595         vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
596         vst1.32 {d8[0]},[r6]
597         vst1.32 {d8[1]},[r7]
598
599         bx      lr                              @ bx    lr
600 .size   poly1305_init_neon,.-poly1305_init_neon
601
602 .type   poly1305_blocks_neon,%function
603 .align  5
604 poly1305_blocks_neon:
605         ldr     ip,[r0,#36]             @ is_base2_26
606         ands    r2,r2,#-16
607         beq     .Lno_data_neon
608
609         cmp     r2,#64
610         bhs     .Lenter_neon
611         tst     ip,ip                   @ is_base2_26?
612         beq     .Lpoly1305_blocks
613
614 .Lenter_neon:
615         stmdb   sp!,{r4,r5,r6,r7}
616         vstmdb  sp!,{d8,d9,d10,d11,d12,d13,d14,d15}             @ ABI specification says so
617
618         tst     ip,ip                   @ is_base2_26?
619         bne     .Lbase2_26_neon
620
621         stmdb   sp!,{r1,r2,r3,lr}
622         bl      poly1305_init_neon
623
624         ldr     r4,[r0,#0]              @ load hash value base 2^32
625         ldr     r5,[r0,#4]
626         ldr     r6,[r0,#8]
627         ldr     r7,[r0,#12]
628         ldr     ip,[r0,#16]
629
630         and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
631         mov     r3,r4,lsr#26
632         veor    d10,d10,d10
633         mov     r4,r5,lsr#20
634         orr     r3,r3,r5,lsl#6
635         veor    d12,d12,d12
636         mov     r5,r6,lsr#14
637         orr     r4,r4,r6,lsl#12
638         veor    d14,d14,d14
639         mov     r6,r7,lsr#8
640         orr     r5,r5,r7,lsl#18
641         veor    d16,d16,d16
642         and     r3,r3,#0x03ffffff
643         orr     r6,r6,ip,lsl#24
644         veor    d18,d18,d18
645         and     r4,r4,#0x03ffffff
646         mov     r1,#1
647         and     r5,r5,#0x03ffffff
648         str     r1,[r0,#36]             @ is_base2_26
649
650         vmov.32 d10[0],r2
651         vmov.32 d12[0],r3
652         vmov.32 d14[0],r4
653         vmov.32 d16[0],r5
654         vmov.32 d18[0],r6
655         adr     r5,.Lzeros
656
657         ldmia   sp!,{r1,r2,r3,lr}
658         b       .Lbase2_32_neon
659
660 .align  4
661 .Lbase2_26_neon:
662         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
663         @ load hash value
664
665         veor    d10,d10,d10
666         veor    d12,d12,d12
667         veor    d14,d14,d14
668         veor    d16,d16,d16
669         veor    d18,d18,d18
670         vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
671         adr     r5,.Lzeros
672         vld1.32 {d18[0]},[r0]
673         sub     r0,r0,#16               @ rewind
674
675 .Lbase2_32_neon:
676         add     r4,r1,#32
677         mov     r3,r3,lsl#24
678         tst     r2,#31
679         beq     .Leven
680
681         vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
682         vmov.32 d28[0],r3
683         sub     r2,r2,#16
684         add     r4,r1,#32
685
686 # ifdef __ARMEB__
687         vrev32.8        q10,q10
688         vrev32.8        q13,q13
689         vrev32.8        q11,q11
690         vrev32.8        q12,q12
691 # endif
692         vsri.u32        d28,d26,#8      @ base 2^32 -> base 2^26
693         vshl.u32        d26,d26,#18
694
695         vsri.u32        d26,d24,#14
696         vshl.u32        d24,d24,#12
697         vadd.i32        d29,d28,d18     @ add hash value and move to #hi
698
699         vbic.i32        d26,#0xfc000000
700         vsri.u32        d24,d22,#20
701         vshl.u32        d22,d22,#6
702
703         vbic.i32        d24,#0xfc000000
704         vsri.u32        d22,d20,#26
705         vadd.i32        d27,d26,d16
706
707         vbic.i32        d20,#0xfc000000
708         vbic.i32        d22,#0xfc000000
709         vadd.i32        d25,d24,d14
710
711         vadd.i32        d21,d20,d10
712         vadd.i32        d23,d22,d12
713
714         mov     r7,r5
715         add     r6,r0,#48
716
717         cmp     r2,r2
718         b       .Long_tail
719
720 .align  4
721 .Leven:
722         subs    r2,r2,#64
723         it      lo
724         movlo   r4,r5
725
726         vmov.i32        q14,#1<<24              @ padbit, yes, always
727         vld4.32 {d20,d22,d24,d26},[r1]  @ inp[0:1]
728         add     r1,r1,#64
729         vld4.32 {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
730         add     r4,r4,#64
731         itt     hi
732         addhi   r7,r0,#(48+1*9*4)
733         addhi   r6,r0,#(48+3*9*4)
734
735 # ifdef __ARMEB__
736         vrev32.8        q10,q10
737         vrev32.8        q13,q13
738         vrev32.8        q11,q11
739         vrev32.8        q12,q12
740 # endif
741         vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
742         vshl.u32        q13,q13,#18
743
744         vsri.u32        q13,q12,#14
745         vshl.u32        q12,q12,#12
746
747         vbic.i32        q13,#0xfc000000
748         vsri.u32        q12,q11,#20
749         vshl.u32        q11,q11,#6
750
751         vbic.i32        q12,#0xfc000000
752         vsri.u32        q11,q10,#26
753
754         vbic.i32        q10,#0xfc000000
755         vbic.i32        q11,#0xfc000000
756
757         bls     .Lskip_loop
758
759         vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
760         vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
761         vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
762         vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
763         b       .Loop_neon
764
765 .align  5
766 .Loop_neon:
767         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
768         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
769         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
770         @   ___________________/
771         @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
772         @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
773         @   ___________________/ ____________________/
774         @
775         @ Note that we start with inp[2:3]*r^2. This is because it
776         @ doesn't depend on reduction in previous iteration.
777         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
778         @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
779         @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
780         @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
781         @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
782         @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
783
784         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
785         @ inp[2:3]*r^2
786
787         vadd.i32        d24,d24,d14     @ accumulate inp[0:1]
788         vmull.u32       q7,d25,d0[1]
789         vadd.i32        d20,d20,d10
790         vmull.u32       q5,d21,d0[1]
791         vadd.i32        d26,d26,d16
792         vmull.u32       q8,d27,d0[1]
793         vmlal.u32       q7,d23,d1[1]
794         vadd.i32        d22,d22,d12
795         vmull.u32       q6,d23,d0[1]
796
797         vadd.i32        d28,d28,d18
798         vmull.u32       q9,d29,d0[1]
799         subs    r2,r2,#64
800         vmlal.u32       q5,d29,d2[1]
801         it      lo
802         movlo   r4,r5
803         vmlal.u32       q8,d25,d1[1]
804         vld1.32 d8[1],[r7,:32]
805         vmlal.u32       q6,d21,d1[1]
806         vmlal.u32       q9,d27,d1[1]
807
808         vmlal.u32       q5,d27,d4[1]
809         vmlal.u32       q8,d23,d3[1]
810         vmlal.u32       q9,d25,d3[1]
811         vmlal.u32       q6,d29,d4[1]
812         vmlal.u32       q7,d21,d3[1]
813
814         vmlal.u32       q8,d21,d5[1]
815         vmlal.u32       q5,d25,d6[1]
816         vmlal.u32       q9,d23,d5[1]
817         vmlal.u32       q6,d27,d6[1]
818         vmlal.u32       q7,d29,d6[1]
819
820         vmlal.u32       q8,d29,d8[1]
821         vmlal.u32       q5,d23,d8[1]
822         vmlal.u32       q9,d21,d7[1]
823         vmlal.u32       q6,d25,d8[1]
824         vmlal.u32       q7,d27,d8[1]
825
826         vld4.32 {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
827         add     r4,r4,#64
828
829         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
830         @ (hash+inp[0:1])*r^4 and accumulate
831
832         vmlal.u32       q8,d26,d0[0]
833         vmlal.u32       q5,d20,d0[0]
834         vmlal.u32       q9,d28,d0[0]
835         vmlal.u32       q6,d22,d0[0]
836         vmlal.u32       q7,d24,d0[0]
837         vld1.32 d8[0],[r6,:32]
838
839         vmlal.u32       q8,d24,d1[0]
840         vmlal.u32       q5,d28,d2[0]
841         vmlal.u32       q9,d26,d1[0]
842         vmlal.u32       q6,d20,d1[0]
843         vmlal.u32       q7,d22,d1[0]
844
845         vmlal.u32       q8,d22,d3[0]
846         vmlal.u32       q5,d26,d4[0]
847         vmlal.u32       q9,d24,d3[0]
848         vmlal.u32       q6,d28,d4[0]
849         vmlal.u32       q7,d20,d3[0]
850
851         vmlal.u32       q8,d20,d5[0]
852         vmlal.u32       q5,d24,d6[0]
853         vmlal.u32       q9,d22,d5[0]
854         vmlal.u32       q6,d26,d6[0]
855         vmlal.u32       q8,d28,d8[0]
856
857         vmlal.u32       q7,d28,d6[0]
858         vmlal.u32       q5,d22,d8[0]
859         vmlal.u32       q9,d20,d7[0]
860         vmov.i32        q14,#1<<24              @ padbit, yes, always
861         vmlal.u32       q6,d24,d8[0]
862         vmlal.u32       q7,d26,d8[0]
863
864         vld4.32 {d20,d22,d24,d26},[r1]  @ inp[0:1]
865         add     r1,r1,#64
866 # ifdef __ARMEB__
867         vrev32.8        q10,q10
868         vrev32.8        q11,q11
869         vrev32.8        q12,q12
870         vrev32.8        q13,q13
871 # endif
872
873         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
874         @ lazy reduction interleaved with base 2^32 -> base 2^26 of
875         @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
876
877         vshr.u64        q15,q8,#26
878         vmovn.i64       d16,q8
879         vshr.u64        q4,q5,#26
880         vmovn.i64       d10,q5
881         vadd.i64        q9,q9,q15               @ h3 -> h4
882         vbic.i32        d16,#0xfc000000
883         vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
884         vadd.i64        q6,q6,q4                @ h0 -> h1
885         vshl.u32        q13,q13,#18
886         vbic.i32        d10,#0xfc000000
887
888         vshrn.u64       d30,q9,#26
889         vmovn.i64       d18,q9
890         vshr.u64        q4,q6,#26
891         vmovn.i64       d12,q6
892         vadd.i64        q7,q7,q4                @ h1 -> h2
893         vsri.u32        q13,q12,#14
894         vbic.i32        d18,#0xfc000000
895         vshl.u32        q12,q12,#12
896         vbic.i32        d12,#0xfc000000
897
898         vadd.i32        d10,d10,d30
899         vshl.u32        d30,d30,#2
900         vbic.i32        q13,#0xfc000000
901         vshrn.u64       d8,q7,#26
902         vmovn.i64       d14,q7
903         vaddl.u32       q5,d10,d30      @ h4 -> h0 [widen for a sec]
904         vsri.u32        q12,q11,#20
905         vadd.i32        d16,d16,d8      @ h2 -> h3
906         vshl.u32        q11,q11,#6
907         vbic.i32        d14,#0xfc000000
908         vbic.i32        q12,#0xfc000000
909
910         vshrn.u64       d30,q5,#26              @ re-narrow
911         vmovn.i64       d10,q5
912         vsri.u32        q11,q10,#26
913         vbic.i32        q10,#0xfc000000
914         vshr.u32        d8,d16,#26
915         vbic.i32        d16,#0xfc000000
916         vbic.i32        d10,#0xfc000000
917         vadd.i32        d12,d12,d30     @ h0 -> h1
918         vadd.i32        d18,d18,d8      @ h3 -> h4
919         vbic.i32        q11,#0xfc000000
920
921         bhi     .Loop_neon
922
923 .Lskip_loop:
924         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
925         @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
926
927         add     r7,r0,#(48+0*9*4)
928         add     r6,r0,#(48+1*9*4)
929         adds    r2,r2,#32
930         it      ne
931         movne   r2,#0
932         bne     .Long_tail
933
934         vadd.i32        d25,d24,d14     @ add hash value and move to #hi
935         vadd.i32        d21,d20,d10
936         vadd.i32        d27,d26,d16
937         vadd.i32        d23,d22,d12
938         vadd.i32        d29,d28,d18
939
940 .Long_tail:
941         vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
942         vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
943
944         vadd.i32        d24,d24,d14     @ can be redundant
945         vmull.u32       q7,d25,d0
946         vadd.i32        d20,d20,d10
947         vmull.u32       q5,d21,d0
948         vadd.i32        d26,d26,d16
949         vmull.u32       q8,d27,d0
950         vadd.i32        d22,d22,d12
951         vmull.u32       q6,d23,d0
952         vadd.i32        d28,d28,d18
953         vmull.u32       q9,d29,d0
954
955         vmlal.u32       q5,d29,d2
956         vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
957         vmlal.u32       q8,d25,d1
958         vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
959         vmlal.u32       q6,d21,d1
960         vmlal.u32       q9,d27,d1
961         vmlal.u32       q7,d23,d1
962
963         vmlal.u32       q8,d23,d3
964         vld1.32 d8[1],[r7,:32]
965         vmlal.u32       q5,d27,d4
966         vld1.32 d8[0],[r6,:32]
967         vmlal.u32       q9,d25,d3
968         vmlal.u32       q6,d29,d4
969         vmlal.u32       q7,d21,d3
970
971         vmlal.u32       q8,d21,d5
972         it      ne
973         addne   r7,r0,#(48+2*9*4)
974         vmlal.u32       q5,d25,d6
975         it      ne
976         addne   r6,r0,#(48+3*9*4)
977         vmlal.u32       q9,d23,d5
978         vmlal.u32       q6,d27,d6
979         vmlal.u32       q7,d29,d6
980
981         vmlal.u32       q8,d29,d8
982         vorn    q0,q0,q0        @ all-ones, can be redundant
983         vmlal.u32       q5,d23,d8
984         vshr.u64        q0,q0,#38
985         vmlal.u32       q9,d21,d7
986         vmlal.u32       q6,d25,d8
987         vmlal.u32       q7,d27,d8
988
989         beq     .Lshort_tail
990
991         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
992         @ (hash+inp[0:1])*r^4:r^3 and accumulate
993
994         vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
995         vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
996
997         vmlal.u32       q7,d24,d0
998         vmlal.u32       q5,d20,d0
999         vmlal.u32       q8,d26,d0
1000         vmlal.u32       q6,d22,d0
1001         vmlal.u32       q9,d28,d0
1002
1003         vmlal.u32       q5,d28,d2
1004         vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1005         vmlal.u32       q8,d24,d1
1006         vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1007         vmlal.u32       q6,d20,d1
1008         vmlal.u32       q9,d26,d1
1009         vmlal.u32       q7,d22,d1
1010
1011         vmlal.u32       q8,d22,d3
1012         vld1.32 d8[1],[r7,:32]
1013         vmlal.u32       q5,d26,d4
1014         vld1.32 d8[0],[r6,:32]
1015         vmlal.u32       q9,d24,d3
1016         vmlal.u32       q6,d28,d4
1017         vmlal.u32       q7,d20,d3
1018
1019         vmlal.u32       q8,d20,d5
1020         vmlal.u32       q5,d24,d6
1021         vmlal.u32       q9,d22,d5
1022         vmlal.u32       q6,d26,d6
1023         vmlal.u32       q7,d28,d6
1024
1025         vmlal.u32       q8,d28,d8
1026         vorn    q0,q0,q0        @ all-ones
1027         vmlal.u32       q5,d22,d8
1028         vshr.u64        q0,q0,#38
1029         vmlal.u32       q9,d20,d7
1030         vmlal.u32       q6,d24,d8
1031         vmlal.u32       q7,d26,d8
1032
1033 .Lshort_tail:
1034         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1035         @ horizontal addition
1036
1037         vadd.i64        d16,d16,d17
1038         vadd.i64        d10,d10,d11
1039         vadd.i64        d18,d18,d19
1040         vadd.i64        d12,d12,d13
1041         vadd.i64        d14,d14,d15
1042
1043         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1044         @ lazy reduction, but without narrowing
1045
1046         vshr.u64        q15,q8,#26
1047         vand.i64        q8,q8,q0
1048         vshr.u64        q4,q5,#26
1049         vand.i64        q5,q5,q0
1050         vadd.i64        q9,q9,q15               @ h3 -> h4
1051         vadd.i64        q6,q6,q4                @ h0 -> h1
1052
1053         vshr.u64        q15,q9,#26
1054         vand.i64        q9,q9,q0
1055         vshr.u64        q4,q6,#26
1056         vand.i64        q6,q6,q0
1057         vadd.i64        q7,q7,q4                @ h1 -> h2
1058
1059         vadd.i64        q5,q5,q15
1060         vshl.u64        q15,q15,#2
1061         vshr.u64        q4,q7,#26
1062         vand.i64        q7,q7,q0
1063         vadd.i64        q5,q5,q15               @ h4 -> h0
1064         vadd.i64        q8,q8,q4                @ h2 -> h3
1065
1066         vshr.u64        q15,q5,#26
1067         vand.i64        q5,q5,q0
1068         vshr.u64        q4,q8,#26
1069         vand.i64        q8,q8,q0
1070         vadd.i64        q6,q6,q15               @ h0 -> h1
1071         vadd.i64        q9,q9,q4                @ h3 -> h4
1072
1073         cmp     r2,#0
1074         bne     .Leven
1075
1076         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1077         @ store hash value
1078
1079         vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
1080         vst1.32 {d18[0]},[r0]
1081
1082         vldmia  sp!,{d8,d9,d10,d11,d12,d13,d14,d15}                     @ epilogue
1083         ldmia   sp!,{r4,r5,r6,r7}
1084 .Lno_data_neon:
1085         bx      lr                                      @ bx    lr
1086 .size   poly1305_blocks_neon,.-poly1305_blocks_neon
1087
1088 .type   poly1305_emit_neon,%function
1089 .align  5
@ poly1305_emit_neon(ctx=r0, mac=r1, nonce=r2)
@ Emit the 16-byte Poly1305 tag from a NEON-maintained state: convert the
@ five base 2^26 hash limbs h0..h4 (at ctx+0..+16) back to base 2^32,
@ reduce modulo 2^130-5, add the 128-bit nonce and store little-endian.
@ If the is_base2_26 flag (ctx+36) is clear, the hash is still in base
@ 2^32 and control falls through to the scalar emit body at
@ .Lpoly1305_emit_enter (defined earlier in this file).
@ Clobbers: r3, r8-r11 via stack save/restore; ip; flags.
1090 poly1305_emit_neon:
1091         ldr     ip,[r0,#36]             @ is_base2_26
1092
1093         stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1094
1095         tst     ip,ip
1096         beq     .Lpoly1305_emit_enter   @ hash still base 2^32 -> scalar path
1097
1098         ldmia   r0,{r3,r4,r5,r6,r7}     @ r3-r7 = h0..h4, base 2^26 limbs
1099         eor     r8,r8,r8                @ r8 = 0 (zero for the final adc)
1100
1101         adds    r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
1102         mov     r4,r4,lsr#6
1103         adcs    r4,r4,r5,lsl#20
1104         mov     r5,r5,lsr#12
1105         adcs    r5,r5,r6,lsl#14
1106         mov     r6,r6,lsr#18
1107         adcs    r6,r6,r7,lsl#8
1108         adc     r7,r8,r7,lsr#24 @ can be partially reduced ...
1109
1110         and     r8,r7,#-4               @ ... so reduce
1111         and     r7,r7,#3                @ h4 &= 3, keep bits 128-129 (bug fix: source operand was r6, which replaced h4 with stray bits of h3)
1112         add     r8,r8,r8,lsr#2  @ *= 5
1113         adds    r3,r3,r8                @ h += 5*(h4>>2), folding 2^130 == 5 (mod 2^130-5)
1114         adcs    r4,r4,#0
1115         adcs    r5,r5,#0
1116         adcs    r6,r6,#0
1117         adc     r7,r7,#0
1118
1119         adds    r8,r3,#5                @ compare to modulus
1120         adcs    r9,r4,#0
1121         adcs    r10,r5,#0
1122         adcs    r11,r6,#0
1123         adc     r7,r7,#0
1124         tst     r7,#4                   @ did it carry/borrow?
1125
1126         it      ne
1127         movne   r3,r8                   @ if h >= 2^130-5 select h+5 (mod 2^128)
1128         ldr     r8,[r2,#0]
1129         it      ne
1130         movne   r4,r9
1131         ldr     r9,[r2,#4]
1132         it      ne
1133         movne   r5,r10
1134         ldr     r10,[r2,#8]
1135         it      ne
1136         movne   r6,r11
1137         ldr     r11,[r2,#12]
1138
1139         adds    r3,r3,r8                @ accumulate nonce
1140         adcs    r4,r4,r9
1141         adcs    r5,r5,r10
1142         adc     r6,r6,r11               @ tag = (h + nonce) mod 2^128
1143
1144 # ifdef __ARMEB__
1145         rev     r3,r3                   @ byte-swap so stores emit little-endian
1146         rev     r4,r4
1147         rev     r5,r5
1148         rev     r6,r6
1149 # endif
1150         str     r3,[r1,#0]              @ store the result
1151         str     r4,[r1,#4]
1152         str     r5,[r1,#8]
1153         str     r6,[r1,#12]
1154
1155         ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1156         bx      lr                              @ bx    lr
1157 .size   poly1305_emit_neon,.-poly1305_emit_neon
1158
1159 .align  5
@ 64 bytes of zeros; loadable by the NEON code as an all-zero constant.
1160 .Lzeros:
1161 .long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
@ Link-time offset of OPENSSL_armcap_P relative to .Lpoly1305_init;
@ poly1305_init adds this word to its adr-derived base (r11) to locate
@ the capability word in a position-independent way.
1162 .LOPENSSL_armcap:
1163 .word   OPENSSL_armcap_P-.Lpoly1305_init
@ NOTE(review): this #endif pairs with an #if opened before this chunk
@ (presumably the __ARM_MAX_ARCH__>=7 guard around the NEON code) —
@ confirm against the full file.
1164 #endif
@ Ident string: "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>",
@ NUL-terminated.
1165 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1166 .align  2
1167 .align  2
1168 #if     __ARM_MAX_ARCH__>=7
@ Common symbol for the runtime CPU-capability word (4 bytes, 4-aligned).
1169 .comm   OPENSSL_armcap_P,4,4
1170 #endif