/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.globl  poly1305_emit
.globl  poly1305_blocks
.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
.Lpoly1305_init:
        stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

        eor     r3,r3,r3
        cmp     r1,#0
        str     r3,[r0,#0]              @ zero hash value
        str     r3,[r0,#4]
        str     r3,[r0,#8]
        str     r3,[r0,#12]
        str     r3,[r0,#16]
        str     r3,[r0,#36]             @ is_base2_26
        add     r0,r0,#20

#ifdef  __thumb2__
        it      eq
#endif
        moveq   r0,#0
        beq     .Lno_key

#if     __ARM_MAX_ARCH__>=7
        adr     r11,.Lpoly1305_init
        ldr     r12,.LOPENSSL_armcap
#endif
        ldrb    r4,[r1,#0]
        mov     r10,#0x0fffffff
        ldrb    r5,[r1,#1]
        and     r3,r10,#-4              @ 0x0ffffffc
        ldrb    r6,[r1,#2]
        ldrb    r7,[r1,#3]
        orr     r4,r4,r5,lsl#8
        ldrb    r5,[r1,#4]
        orr     r4,r4,r6,lsl#16
        ldrb    r6,[r1,#5]
        orr     r4,r4,r7,lsl#24
        ldrb    r7,[r1,#6]
        and     r4,r4,r10

#if     __ARM_MAX_ARCH__>=7
        ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
# ifdef __APPLE__
        ldr     r12,[r12]
# endif
#endif
        ldrb    r8,[r1,#7]
        orr     r5,r5,r6,lsl#8
        ldrb    r6,[r1,#8]
        orr     r5,r5,r7,lsl#16
        ldrb    r7,[r1,#9]
        orr     r5,r5,r8,lsl#24
        ldrb    r8,[r1,#10]
        and     r5,r5,r3

#if     __ARM_MAX_ARCH__>=7
        tst     r12,#ARMV7_NEON         @ check for NEON
# ifdef __APPLE__
        adr     r9,poly1305_blocks_neon
        adr     r11,poly1305_blocks
#  ifdef __thumb2__
        it      ne
#  endif
        movne   r11,r9
        adr     r12,poly1305_emit
        adr     r10,poly1305_emit_neon
#  ifdef __thumb2__
        it      ne
#  endif
        movne   r12,r10
# else
#  ifdef __thumb2__
        itete   eq
#  endif
        addeq   r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
        addne   r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
        addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
        addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
        orr     r12,r12,#1      @ thumb-ify address
        orr     r11,r11,#1
# endif
#endif
        ldrb    r9,[r1,#11]
        orr     r6,r6,r7,lsl#8
        ldrb    r7,[r1,#12]
        orr     r6,r6,r8,lsl#16
        ldrb    r8,[r1,#13]
        orr     r6,r6,r9,lsl#24
        ldrb    r9,[r1,#14]
        and     r6,r6,r3

        ldrb    r10,[r1,#15]
        orr     r7,r7,r8,lsl#8
        str     r4,[r0,#0]
        orr     r7,r7,r9,lsl#16
        str     r5,[r0,#4]
        orr     r7,r7,r10,lsl#24
        str     r6,[r0,#8]
        and     r7,r7,r3
        str     r7,[r0,#12]
#if     __ARM_MAX_ARCH__>=7
        stmia   r2,{r11,r12}            @ fill functions table
        mov     r0,#1
#else
        mov     r0,#0
#endif
.Lno_key:
        ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if     __ARM_ARCH__>=5
        bx      lr                              @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
.word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_init,.-poly1305_init
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
.Lpoly1305_blocks:
        stmdb   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

        ands    r2,r2,#-16
        beq     .Lno_data

        cmp     r3,#0
        add     r2,r2,r1                @ end pointer
        sub     sp,sp,#32

        ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}              @ load context

        str     r0,[sp,#12]             @ offload stuff
        mov     lr,r1
        str     r2,[sp,#16]
        str     r10,[sp,#20]
        str     r11,[sp,#24]
        str     r12,[sp,#28]
        b       .Loop

.Loop:
#if __ARM_ARCH__<7
        ldrb    r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   r8,r8,#1                @ 1<<128
        ldrb    r1,[lr,#-15]
        ldrb    r2,[lr,#-14]
        ldrb    r3,[lr,#-13]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-12]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-11]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-10]
        adds    r4,r4,r3                @ accumulate input

        ldrb    r3,[lr,#-9]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-8]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-7]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-6]
        adcs    r5,r5,r3

        ldrb    r3,[lr,#-5]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-4]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-3]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-2]
        adcs    r6,r6,r3

        ldrb    r3,[lr,#-1]
        orr     r1,r0,r1,lsl#8
        str     lr,[sp,#8]              @ offload input pointer
        orr     r2,r1,r2,lsl#16
        add     r10,r10,r10,lsr#2
        orr     r3,r2,r3,lsl#24
#else
        ldr     r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   r8,r8,#1                @ padbit
        ldr     r1,[lr,#-12]
        ldr     r2,[lr,#-8]
        ldr     r3,[lr,#-4]
# ifdef __ARMEB__
        rev     r0,r0
        rev     r1,r1
        rev     r2,r2
        rev     r3,r3
# endif
        adds    r4,r4,r0                @ accumulate input
        str     lr,[sp,#8]              @ offload input pointer
        adcs    r5,r5,r1
        add     r10,r10,r10,lsr#2
        adcs    r6,r6,r2
#endif
        add     r11,r11,r11,lsr#2
        adcs    r7,r7,r3
        add     r12,r12,r12,lsr#2

        umull   r2,r3,r5,r9
        adc     r8,r8,#0
        umull   r0,r1,r4,r9
        umlal   r2,r3,r8,r10
        umlal   r0,r1,r7,r10
        ldr     r10,[sp,#20]            @ reload r10
        umlal   r2,r3,r6,r12
        umlal   r0,r1,r5,r12
        umlal   r2,r3,r7,r11
        umlal   r0,r1,r6,r11
        umlal   r2,r3,r4,r10
        str     r0,[sp,#0]              @ future r4
        mul     r0,r11,r8
        ldr     r11,[sp,#24]            @ reload r11
        adds    r2,r2,r1                @ d1+=d0>>32
        eor     r1,r1,r1
        adc     lr,r3,#0                @ future r6
        str     r2,[sp,#4]              @ future r5

        mul     r2,r12,r8
        eor     r3,r3,r3
        umlal   r0,r1,r7,r12
        ldr     r12,[sp,#28]            @ reload r12
        umlal   r2,r3,r7,r9
        umlal   r0,r1,r6,r9
        umlal   r2,r3,r6,r10
        umlal   r0,r1,r5,r10
        umlal   r2,r3,r5,r11
        umlal   r0,r1,r4,r11
        umlal   r2,r3,r4,r12
        ldr     r4,[sp,#0]
        mul     r8,r9,r8
        ldr     r5,[sp,#4]

        adds    r6,lr,r0                @ d2+=d1>>32
        ldr     lr,[sp,#8]              @ reload input pointer
        adc     r1,r1,#0
        adds    r7,r2,r1                @ d3+=d2>>32
        ldr     r0,[sp,#16]             @ reload end pointer
        adc     r3,r3,#0
        add     r8,r8,r3                @ h4+=d3>>32

        and     r1,r8,#-4
        and     r8,r8,#3
        add     r1,r1,r1,lsr#2          @ *=5
        adds    r4,r4,r1
        adcs    r5,r5,#0
        adcs    r6,r6,#0
        adcs    r7,r7,#0
        adc     r8,r8,#0

        cmp     r0,lr                   @ done yet?
        bhi     .Loop

        ldr     r0,[sp,#12]
        add     sp,sp,#32
        stmia   r0,{r4,r5,r6,r7,r8}             @ store the result

.Lno_data:
#if     __ARM_ARCH__>=5
        ldmia   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
        ldmia   sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
.word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_blocks,.-poly1305_blocks
.type   poly1305_emit,%function
.align  5
poly1305_emit:
.Lpoly1305_emit:
        stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

        ldmia   r0,{r3,r4,r5,r6,r7}
        adds    r8,r3,#5                @ compare to modulus
        adcs    r9,r4,#0
        adcs    r10,r5,#0
        adcs    r11,r6,#0
        adc     r7,r7,#0
        tst     r7,#4                   @ did it carry/borrow?

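        @ if h+5 carried into bit 2^130, then h >= 2^130-5 = p, so the
        @ reduced value h-p (now held in r8-r11) is selected below;
        @ otherwise the original h in r3-r6 is kept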
#ifdef  __thumb2__
        it      ne
#endif
        movne   r3,r8
        ldr     r8,[r2,#0]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r4,r9
        ldr     r9,[r2,#4]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r5,r10
        ldr     r10,[r2,#8]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r6,r11
        ldr     r11,[r2,#12]

        adds    r3,r3,r8
        adcs    r4,r4,r9
        adcs    r5,r5,r10
        adc     r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
        rev     r3,r3
        rev     r4,r4
        rev     r5,r5
        rev     r6,r6
# endif
        str     r3,[r1,#0]
        str     r4,[r1,#4]
        str     r5,[r1,#8]
        str     r6,[r1,#12]
#else
        strb    r3,[r1,#0]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#4]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#8]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#12]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#1]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#5]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#9]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#13]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#2]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#6]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#10]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#14]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#3]
        strb    r4,[r1,#7]
        strb    r5,[r1,#11]
        strb    r6,[r1,#15]
#endif
        ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if     __ARM_ARCH__>=5
        bx      lr                              @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
.word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_emit,.-poly1305_emit
#if     __ARM_MAX_ARCH__>=7
.fpu    neon

.type   poly1305_init_neon,%function
.align  5
poly1305_init_neon:
        ldr     r4,[r0,#20]             @ load key base 2^32
        ldr     r5,[r0,#24]
        ldr     r6,[r0,#28]
        ldr     r7,[r0,#32]

        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
        and     r3,r3,#0x03ffffff
        and     r4,r4,#0x03ffffff
        and     r5,r5,#0x03ffffff
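        @ i.e. with k[0..3] denoting the four 32-bit words of the clamped
        @ key "r" loaded above, the five base 2^26 limbs just computed are
        @   r0 =  k[0]                  & 0x3ffffff
        @   r1 = (k[0]>>26 | k[1]<< 6)  & 0x3ffffff
        @   r2 = (k[1]>>20 | k[2]<<12)  & 0x3ffffff
        @   r3 = (k[2]>>14 | k[3]<<18)  & 0x3ffffff
        @   r4 =  k[3]>> 8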

        vdup.32 d0,r2                   @ r^1 in both lanes
        add     r2,r3,r3,lsl#2          @ *5
        vdup.32 d1,r3
        add     r3,r4,r4,lsl#2
        vdup.32 d2,r2
        vdup.32 d3,r4
        add     r4,r5,r5,lsl#2
        vdup.32 d4,r3
        vdup.32 d5,r5
        add     r5,r6,r6,lsl#2
        vdup.32 d6,r4
        vdup.32 d7,r6
        vdup.32 d8,r5

        mov     r5,#2           @ counter

.Lsquare_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

        vmull.u32       q5,d0,d0[1]
        vmull.u32       q6,d1,d0[1]
        vmull.u32       q7,d3,d0[1]
        vmull.u32       q8,d5,d0[1]
        vmull.u32       q9,d7,d0[1]

        vmlal.u32       q5,d7,d2[1]
        vmlal.u32       q6,d0,d1[1]
        vmlal.u32       q7,d1,d1[1]
        vmlal.u32       q8,d3,d1[1]
        vmlal.u32       q9,d5,d1[1]

        vmlal.u32       q5,d5,d4[1]
        vmlal.u32       q6,d7,d4[1]
        vmlal.u32       q8,d1,d3[1]
        vmlal.u32       q7,d0,d3[1]
        vmlal.u32       q9,d3,d3[1]

        vmlal.u32       q5,d3,d6[1]
        vmlal.u32       q8,d0,d5[1]
        vmlal.u32       q6,d5,d6[1]
        vmlal.u32       q7,d7,d6[1]
        vmlal.u32       q9,d1,d5[1]

        vmlal.u32       q8,d7,d8[1]
        vmlal.u32       q5,d1,d8[1]
        vmlal.u32       q6,d3,d8[1]
        vmlal.u32       q7,d5,d8[1]
        vmlal.u32       q9,d0,d7[1]

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        @ and P. Schwabe
        @
        @ H0>>+H1>>+H2>>+H3>>+H4
        @ H3>>+H4>>*5+H0>>+H1
        @
        @ Trivia.
        @
        @ The result of multiplying an n-bit number by an m-bit number is
        @ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
        @ an m-bit number multiplied by 2^n is still only n+m bits wide.
        @
        @ The sum of two n-bit numbers is n+1 bits wide, a sum of three
        @ is n+2, and so is a sum of four. The sum of 2^m (n-m)-bit
        @ numbers and one n-bit number is n+1 bits wide.
        @
        @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
        @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
        @ can be 27. However! In cases when their width exceeds 26 bits
        @ they are limited by 2^26+2^6. This in turn means that the *sum*
        @ of the products with these values can still be viewed as a sum
        @ of 52-bit numbers as long as the number of addends is not a
        @ power of 2. For example,
        @
        @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
        @
        @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
        @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
        @ 8 * (2^52) or 2^55. However, the value is then multiplied
        @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
        @ which is less than 32 * (2^52) or 2^57. And when processing
        @ data we are looking at three times as many addends...
        @
        @ In the key setup procedure the pre-reduced H0 is limited by
        @ 5*4+1 52-bit addends, and 5*H4 by 5*5 of them, or 57 bits. But
        @ when hashing the input, H0 is limited by (5*4+1)*3 addends, or
        @ 58 bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
        @ relevant? The vmlal.u32 instruction accepts 2x32-bit inputs and
        @ writes a 2x64-bit result. This means that the result of the
        @ reduction has to be compressed upon loop wrap-around. This can
        @ be done in the process of reduction to minimize the number of
        @ instructions [as well as the number of 128-bit instructions,
        @ which benefits low-end processors], but one has to watch for H2
        @ (which is narrower than H0) and 5*H4 not being wider than 58
        @ bits, so that the result of the right shift by 26 bits fits in
        @ 32 bits. This is also useful on x86, because it allows paddd to
        @ be used in place of paddq, which benefits Atom, where paddq is
        @ ridiculously slow.
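        @
        @ Note also that the "h4 -> h0" step below folds the carry out of
        @ H4 back into H0 multiplied by 5, since 2^130 == 5 (mod 2^130-5):
        @ the add, shift-left-by-2, add sequence computes c + 4*c = 5*c.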

        vshr.u64        q15,q8,#26
        vmovn.i64       d16,q8
        vshr.u64        q4,q5,#26
        vmovn.i64       d10,q5
        vadd.i64        q9,q9,q15               @ h3 -> h4
        vbic.i32        d16,#0xfc000000 @ &=0x03ffffff
        vadd.i64        q6,q6,q4                @ h0 -> h1
        vbic.i32        d10,#0xfc000000

        vshrn.u64       d30,q9,#26
        vmovn.i64       d18,q9
        vshr.u64        q4,q6,#26
        vmovn.i64       d12,q6
        vadd.i64        q7,q7,q4                @ h1 -> h2
        vbic.i32        d18,#0xfc000000
        vbic.i32        d12,#0xfc000000

        vadd.i32        d10,d10,d30
        vshl.u32        d30,d30,#2
        vshrn.u64       d8,q7,#26
        vmovn.i64       d14,q7
        vadd.i32        d10,d10,d30     @ h4 -> h0
        vadd.i32        d16,d16,d8      @ h2 -> h3
        vbic.i32        d14,#0xfc000000

        vshr.u32        d30,d10,#26
        vbic.i32        d10,#0xfc000000
        vshr.u32        d8,d16,#26
        vbic.i32        d16,#0xfc000000
        vadd.i32        d12,d12,d30     @ h0 -> h1
        vadd.i32        d18,d18,d8      @ h3 -> h4

        subs    r5,r5,#1
        beq     .Lsquare_break_neon

        add     r6,r0,#(48+0*9*4)
        add     r7,r0,#(48+1*9*4)

        vtrn.32 d0,d10          @ r^2:r^1
        vtrn.32 d3,d14
        vtrn.32 d5,d16
        vtrn.32 d1,d12
        vtrn.32 d7,d18

        vshl.u32        d4,d3,#2                @ *5
        vshl.u32        d6,d5,#2
        vshl.u32        d2,d1,#2
        vshl.u32        d8,d7,#2
        vadd.i32        d4,d4,d3
        vadd.i32        d2,d2,d1
        vadd.i32        d6,d6,d5
        vadd.i32        d8,d8,d7

        vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
        vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
        vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vst1.32 {d8[0]},[r6,:32]
        vst1.32 {d8[1]},[r7,:32]

        b       .Lsquare_neon

.align  4
.Lsquare_break_neon:
        add     r6,r0,#(48+2*4*9)
        add     r7,r0,#(48+3*4*9)

        vmov    d0,d10          @ r^4:r^3
        vshl.u32        d2,d12,#2               @ *5
        vmov    d1,d12
        vshl.u32        d4,d14,#2
        vmov    d3,d14
        vshl.u32        d6,d16,#2
        vmov    d5,d16
        vshl.u32        d8,d18,#2
        vmov    d7,d18
        vadd.i32        d2,d2,d12
        vadd.i32        d4,d4,d14
        vadd.i32        d6,d6,d16
        vadd.i32        d8,d8,d18

        vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
        vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
        vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vst1.32 {d8[0]},[r6]
        vst1.32 {d8[1]},[r7]

        bx      lr                              @ bx    lr
.size   poly1305_init_neon,.-poly1305_init_neon

.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
        ldr     ip,[r0,#36]             @ is_base2_26
        ands    r2,r2,#-16
        beq     .Lno_data_neon

        cmp     r2,#64
        bhs     .Lenter_neon
        tst     ip,ip                   @ is_base2_26?
        beq     .Lpoly1305_blocks

.Lenter_neon:
        stmdb   sp!,{r4,r5,r6,r7}
        vstmdb  sp!,{d8,d9,d10,d11,d12,d13,d14,d15}             @ ABI specification says so

        tst     ip,ip                   @ is_base2_26?
        bne     .Lbase2_26_neon

        stmdb   sp!,{r1,r2,r3,lr}
        bl      poly1305_init_neon

        ldr     r4,[r0,#0]              @ load hash value base 2^32
        ldr     r5,[r0,#4]
        ldr     r6,[r0,#8]
        ldr     r7,[r0,#12]
        ldr     ip,[r0,#16]

        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
        veor    d10,d10,d10
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
        veor    d12,d12,d12
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
        veor    d14,d14,d14
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
        veor    d16,d16,d16
        and     r3,r3,#0x03ffffff
        orr     r6,r6,ip,lsl#24
        veor    d18,d18,d18
        and     r4,r4,#0x03ffffff
        mov     r1,#1
        and     r5,r5,#0x03ffffff
        str     r1,[r0,#36]             @ is_base2_26

        vmov.32 d10[0],r2
        vmov.32 d12[0],r3
        vmov.32 d14[0],r4
        vmov.32 d16[0],r5
        vmov.32 d18[0],r6
        adr     r5,.Lzeros

        ldmia   sp!,{r1,r2,r3,lr}
        b       .Lbase2_32_neon

.align  4
.Lbase2_26_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ load hash value

        veor    d10,d10,d10
        veor    d12,d12,d12
        veor    d14,d14,d14
        veor    d16,d16,d16
        veor    d18,d18,d18
        vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
        adr     r5,.Lzeros
        vld1.32 {d18[0]},[r0]
        sub     r0,r0,#16               @ rewind

.Lbase2_32_neon:
        add     r4,r1,#32
        mov     r3,r3,lsl#24
        tst     r2,#31
        beq     .Leven

        vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
        vmov.32 d28[0],r3
        sub     r2,r2,#16
        add     r4,r1,#32

# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
# endif
        vsri.u32        d28,d26,#8      @ base 2^32 -> base 2^26
        vshl.u32        d26,d26,#18

        vsri.u32        d26,d24,#14
        vshl.u32        d24,d24,#12
        vadd.i32        d29,d28,d18     @ add hash value and move to #hi

        vbic.i32        d26,#0xfc000000
        vsri.u32        d24,d22,#20
        vshl.u32        d22,d22,#6

        vbic.i32        d24,#0xfc000000
        vsri.u32        d22,d20,#26
        vadd.i32        d27,d26,d16

        vbic.i32        d20,#0xfc000000
        vbic.i32        d22,#0xfc000000
        vadd.i32        d25,d24,d14

        vadd.i32        d21,d20,d10
        vadd.i32        d23,d22,d12

        mov     r7,r5
        add     r6,r0,#48

        cmp     r2,r2
        b       .Long_tail

.align  4
.Leven:
        subs    r2,r2,#64
        it      lo
        movlo   r4,r5

        vmov.i32        q14,#1<<24              @ padbit, yes, always
        vld4.32 {d20,d22,d24,d26},[r1]  @ inp[0:1]
        add     r1,r1,#64
        vld4.32 {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
        add     r4,r4,#64
        itt     hi
        addhi   r7,r0,#(48+1*9*4)
        addhi   r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
# endif
        vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
        vshl.u32        q13,q13,#18

        vsri.u32        q13,q12,#14
        vshl.u32        q12,q12,#12

        vbic.i32        q13,#0xfc000000
        vsri.u32        q12,q11,#20
        vshl.u32        q11,q11,#6

        vbic.i32        q12,#0xfc000000
        vsri.u32        q11,q10,#26

        vbic.i32        q10,#0xfc000000
        vbic.i32        q11,#0xfc000000

        bls     .Lskip_loop

        vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
        vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
        vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
        b       .Loop_neon

.align  5
.Loop_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        @   ___________________/
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        @   ___________________/ ____________________/
        @
        @ Note that we start with inp[2:3]*r^2. This is because it
        @ doesn't depend on reduction in previous iteration.
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ inp[2:3]*r^2

        vadd.i32        d24,d24,d14     @ accumulate inp[0:1]
        vmull.u32       q7,d25,d0[1]
        vadd.i32        d20,d20,d10
        vmull.u32       q5,d21,d0[1]
        vadd.i32        d26,d26,d16
        vmull.u32       q8,d27,d0[1]
        vmlal.u32       q7,d23,d1[1]
        vadd.i32        d22,d22,d12
        vmull.u32       q6,d23,d0[1]

        vadd.i32        d28,d28,d18
        vmull.u32       q9,d29,d0[1]
        subs    r2,r2,#64
        vmlal.u32       q5,d29,d2[1]
        it      lo
        movlo   r4,r5
        vmlal.u32       q8,d25,d1[1]
        vld1.32 d8[1],[r7,:32]
        vmlal.u32       q6,d21,d1[1]
        vmlal.u32       q9,d27,d1[1]

        vmlal.u32       q5,d27,d4[1]
        vmlal.u32       q8,d23,d3[1]
        vmlal.u32       q9,d25,d3[1]
        vmlal.u32       q6,d29,d4[1]
        vmlal.u32       q7,d21,d3[1]

        vmlal.u32       q8,d21,d5[1]
        vmlal.u32       q5,d25,d6[1]
        vmlal.u32       q9,d23,d5[1]
        vmlal.u32       q6,d27,d6[1]
        vmlal.u32       q7,d29,d6[1]

        vmlal.u32       q8,d29,d8[1]
        vmlal.u32       q5,d23,d8[1]
        vmlal.u32       q9,d21,d7[1]
        vmlal.u32       q6,d25,d8[1]
        vmlal.u32       q7,d27,d8[1]

        vld4.32 {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
        add     r4,r4,#64

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4 and accumulate

        vmlal.u32       q8,d26,d0[0]
        vmlal.u32       q5,d20,d0[0]
        vmlal.u32       q9,d28,d0[0]
        vmlal.u32       q6,d22,d0[0]
        vmlal.u32       q7,d24,d0[0]
        vld1.32 d8[0],[r6,:32]

        vmlal.u32       q8,d24,d1[0]
        vmlal.u32       q5,d28,d2[0]
        vmlal.u32       q9,d26,d1[0]
        vmlal.u32       q6,d20,d1[0]
        vmlal.u32       q7,d22,d1[0]

        vmlal.u32       q8,d22,d3[0]
        vmlal.u32       q5,d26,d4[0]
        vmlal.u32       q9,d24,d3[0]
        vmlal.u32       q6,d28,d4[0]
        vmlal.u32       q7,d20,d3[0]

        vmlal.u32       q8,d20,d5[0]
        vmlal.u32       q5,d24,d6[0]
        vmlal.u32       q9,d22,d5[0]
        vmlal.u32       q6,d26,d6[0]
        vmlal.u32       q8,d28,d8[0]

        vmlal.u32       q7,d28,d6[0]
        vmlal.u32       q5,d22,d8[0]
        vmlal.u32       q9,d20,d7[0]
        vmov.i32        q14,#1<<24              @ padbit, yes, always
        vmlal.u32       q6,d24,d8[0]
        vmlal.u32       q7,d26,d8[0]

        vld4.32 {d20,d22,d24,d26},[r1]  @ inp[0:1]
        add     r1,r1,#64
# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q11,q11
        vrev32.8        q12,q12
        vrev32.8        q13,q13
# endif

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction interleaved with base 2^32 -> base 2^26 of
        @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

        vshr.u64        q15,q8,#26
        vmovn.i64       d16,q8
        vshr.u64        q4,q5,#26
        vmovn.i64       d10,q5
        vadd.i64        q9,q9,q15               @ h3 -> h4
        vbic.i32        d16,#0xfc000000
        vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
        vadd.i64        q6,q6,q4                @ h0 -> h1
        vshl.u32        q13,q13,#18
        vbic.i32        d10,#0xfc000000

        vshrn.u64       d30,q9,#26
        vmovn.i64       d18,q9
        vshr.u64        q4,q6,#26
        vmovn.i64       d12,q6
        vadd.i64        q7,q7,q4                @ h1 -> h2
        vsri.u32        q13,q12,#14
        vbic.i32        d18,#0xfc000000
        vshl.u32        q12,q12,#12
        vbic.i32        d12,#0xfc000000

        vadd.i32        d10,d10,d30
        vshl.u32        d30,d30,#2
        vbic.i32        q13,#0xfc000000
        vshrn.u64       d8,q7,#26
        vmovn.i64       d14,q7
        vaddl.u32       q5,d10,d30      @ h4 -> h0 [widen for a sec]
        vsri.u32        q12,q11,#20
        vadd.i32        d16,d16,d8      @ h2 -> h3
        vshl.u32        q11,q11,#6
        vbic.i32        d14,#0xfc000000
        vbic.i32        q12,#0xfc000000

        vshrn.u64       d30,q5,#26              @ re-narrow
        vmovn.i64       d10,q5
        vsri.u32        q11,q10,#26
        vbic.i32        q10,#0xfc000000
        vshr.u32        d8,d16,#26
        vbic.i32        d16,#0xfc000000
        vbic.i32        d10,#0xfc000000
        vadd.i32        d12,d12,d30     @ h0 -> h1
        vadd.i32        d18,d18,d8      @ h3 -> h4
        vbic.i32        q11,#0xfc000000

        bhi     .Loop_neon

.Lskip_loop:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        add     r7,r0,#(48+0*9*4)
        add     r6,r0,#(48+1*9*4)
        adds    r2,r2,#32
        it      ne
        movne   r2,#0
        bne     .Long_tail

        vadd.i32        d25,d24,d14     @ add hash value and move to #hi
        vadd.i32        d21,d20,d10
        vadd.i32        d27,d26,d16
        vadd.i32        d23,d22,d12
        vadd.i32        d29,d28,d18

.Long_tail:
        vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
        vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2

        vadd.i32        d24,d24,d14     @ can be redundant
        vmull.u32       q7,d25,d0
        vadd.i32        d20,d20,d10
        vmull.u32       q5,d21,d0
        vadd.i32        d26,d26,d16
        vmull.u32       q8,d27,d0
        vadd.i32        d22,d22,d12
        vmull.u32       q6,d23,d0
        vadd.i32        d28,d28,d18
        vmull.u32       q9,d29,d0

        vmlal.u32       q5,d29,d2
        vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vmlal.u32       q8,d25,d1
        vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vmlal.u32       q6,d21,d1
        vmlal.u32       q9,d27,d1
        vmlal.u32       q7,d23,d1

        vmlal.u32       q8,d23,d3
        vld1.32 d8[1],[r7,:32]
        vmlal.u32       q5,d27,d4
        vld1.32 d8[0],[r6,:32]
        vmlal.u32       q9,d25,d3
        vmlal.u32       q6,d29,d4
        vmlal.u32       q7,d21,d3

        vmlal.u32       q8,d21,d5
        it      ne
        addne   r7,r0,#(48+2*9*4)
        vmlal.u32       q5,d25,d6
        it      ne
        addne   r6,r0,#(48+3*9*4)
        vmlal.u32       q9,d23,d5
        vmlal.u32       q6,d27,d6
        vmlal.u32       q7,d29,d6

        vmlal.u32       q8,d29,d8
        vorn    q0,q0,q0        @ all-ones, can be redundant
        vmlal.u32       q5,d23,d8
        vshr.u64        q0,q0,#38
        vmlal.u32       q9,d21,d7
        vmlal.u32       q6,d25,d8
        vmlal.u32       q7,d27,d8

        beq     .Lshort_tail

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4:r^3 and accumulate

        vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
        vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4

        vmlal.u32       q7,d24,d0
        vmlal.u32       q5,d20,d0
        vmlal.u32       q8,d26,d0
        vmlal.u32       q6,d22,d0
        vmlal.u32       q9,d28,d0

        vmlal.u32       q5,d28,d2
        vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vmlal.u32       q8,d24,d1
        vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vmlal.u32       q6,d20,d1
        vmlal.u32       q9,d26,d1
        vmlal.u32       q7,d22,d1

        vmlal.u32       q8,d22,d3
        vld1.32 d8[1],[r7,:32]
        vmlal.u32       q5,d26,d4
        vld1.32 d8[0],[r6,:32]
        vmlal.u32       q9,d24,d3
        vmlal.u32       q6,d28,d4
        vmlal.u32       q7,d20,d3

        vmlal.u32       q8,d20,d5
        vmlal.u32       q5,d24,d6
        vmlal.u32       q9,d22,d5
        vmlal.u32       q6,d26,d6
        vmlal.u32       q7,d28,d6

        vmlal.u32       q8,d28,d8
        vorn    q0,q0,q0        @ all-ones
        vmlal.u32       q5,d22,d8
        vshr.u64        q0,q0,#38
        vmlal.u32       q9,d20,d7
        vmlal.u32       q6,d24,d8
        vmlal.u32       q7,d26,d8

.Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ horizontal addition

        vadd.i64        d16,d16,d17
        vadd.i64        d10,d10,d11
        vadd.i64        d18,d18,d19
        vadd.i64        d12,d12,d13
        vadd.i64        d14,d14,d15

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction, but without narrowing

        vshr.u64        q15,q8,#26
        vand.i64        q8,q8,q0
        vshr.u64        q4,q5,#26
        vand.i64        q5,q5,q0
        vadd.i64        q9,q9,q15               @ h3 -> h4
        vadd.i64        q6,q6,q4                @ h0 -> h1

        vshr.u64        q15,q9,#26
        vand.i64        q9,q9,q0
        vshr.u64        q4,q6,#26
        vand.i64        q6,q6,q0
        vadd.i64        q7,q7,q4                @ h1 -> h2

        vadd.i64        q5,q5,q15
        vshl.u64        q15,q15,#2
        vshr.u64        q4,q7,#26
        vand.i64        q7,q7,q0
        vadd.i64        q5,q5,q15               @ h4 -> h0
        vadd.i64        q8,q8,q4                @ h2 -> h3

        vshr.u64        q15,q5,#26
        vand.i64        q5,q5,q0
        vshr.u64        q4,q8,#26
        vand.i64        q8,q8,q0
        vadd.i64        q6,q6,q15               @ h0 -> h1
        vadd.i64        q9,q9,q4                @ h3 -> h4

        cmp     r2,#0
        bne     .Leven

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ store hash value

        vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
        vst1.32 {d18[0]},[r0]

        vldmia  sp!,{d8,d9,d10,d11,d12,d13,d14,d15}                     @ epilogue
        ldmia   sp!,{r4,r5,r6,r7}
.Lno_data_neon:
        bx      lr                                      @ bx    lr
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

.type   poly1305_emit_neon,%function
.align  5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
        ldr     ip,[r0,#36]             @ is_base2_26

        stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

        tst     ip,ip
        beq     .Lpoly1305_emit_enter

        ldmia   r0,{r3,r4,r5,r6,r7}
        eor     r8,r8,r8

        adds    r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
        mov     r4,r4,lsr#6
        adcs    r4,r4,r5,lsl#20
        mov     r5,r5,lsr#12
        adcs    r5,r5,r6,lsl#14
        mov     r6,r6,lsr#18
        adcs    r6,r6,r7,lsl#8
        adc     r7,r8,r7,lsr#24 @ can be partially reduced ...

        and     r8,r7,#-4               @ ... so reduce
        and     r7,r6,#3
        add     r8,r8,r8,lsr#2  @ *= 5
        adds    r3,r3,r8
        adcs    r4,r4,#0
        adcs    r5,r5,#0
        adcs    r6,r6,#0
        adc     r7,r7,#0

        adds    r8,r3,#5                @ compare to modulus
        adcs    r9,r4,#0
        adcs    r10,r5,#0
        adcs    r11,r6,#0
        adc     r7,r7,#0
        tst     r7,#4                   @ did it carry/borrow?

        it      ne
        movne   r3,r8
        ldr     r8,[r2,#0]
        it      ne
        movne   r4,r9
        ldr     r9,[r2,#4]
        it      ne
        movne   r5,r10
        ldr     r10,[r2,#8]
        it      ne
        movne   r6,r11
        ldr     r11,[r2,#12]

        adds    r3,r3,r8                @ accumulate nonce
        adcs    r4,r4,r9
        adcs    r5,r5,r10
        adc     r6,r6,r11

# ifdef __ARMEB__
        rev     r3,r3
        rev     r4,r4
        rev     r5,r5
        rev     r6,r6
# endif
        str     r3,[r1,#0]              @ store the result
        str     r4,[r1,#4]
        str     r5,[r1,#8]
        str     r6,[r1,#12]

        ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
        bx      lr                              @ bx    lr
.size   poly1305_emit_neon,.-poly1305_emit_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte   80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2
#if     __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif