/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

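// Context layout implied by the loads/stores below (a sketch inferred
// from this file, not an authoritative structure definition):
//
//	offset  0: u64 h[3]	hash accumulator, base 2^64 (or five
//				packed u32 limbs once in base 2^26)
//	offset 24: u64 is_base2_26	non-zero while hash is base 2^26
//	offset 32: u64 r[2]	clamped key r
//	offset 48: u32 table[9][4]	r^4..r^1 in base 2^26, one power
//				per 4-byte column of each 16-byte row
//
// poly1305_init(ctx, key, func[2]) zeroes the hash, clamps the first
// 16 bytes of the key into r, and stores pointers to either the scalar
// or the NEON blocks/emit routines in func[], depending on the
// ARMV7_NEON bit of OPENSSL_armcap_P.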
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

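// poly1305_blocks(ctx, inp, len, padbit) runs the hash in base 2^64:
// per 16-byte block, h += block + padbit*2^128, then h *= r mod 2^130-5.
// Since 2^130 = 5 (mod p), product bits 130 and up fold back in times 5,
// which the code writes as x + 4x; the same identity gives the
// precomputed s1 = r1 + (r1>>2) = 5*r1/4 (valid because clamping makes
// r1 a multiple of 4). Rough C sketch of one iteration, carries elided:
//
//	h0 += inp[0]; h1 += inp[1]; h2 += padbit;	// h2 stays tiny
//	d  = h0*r0 + h1*s1			// bits   0..127
//	   + ((h0*r1 + h1*r0 + h2*s1) << 64)	// bits  64..191
//	   + ((h2*r0) << 128);			// top limb
//	h2 = (d >> 128) & 3;			// keep h below 2^130
//	h0:h1 = (d mod 2^128) + 5*(d >> 130);	// fold 2^130 = 5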
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

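// poly1305_emit(ctx, mac, nonce) finishes the tag: it conditionally
// subtracts the modulus p = 2^130-5 by adding 5 and testing whether the
// sum reached 2^130 (any of bits 2+ of the top limb set), then adds the
// 128-bit nonce and writes the low 128 bits.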
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
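// poly1305_mult is the same multiply-and-reduce step as the .Loop body
// above, factored out as a subroutine so the NEON setup code can call
// it with bl to raise r to the powers r^2, r^3 and r^4 for the table.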
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

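// poly1305_splat converts the 130-bit value in x4:x5:x6 from base 2^64
// to five base 2^26 limbs and stores them, together with the 5*limb
// companions the lazy-reduction multiply needs, into one 4-byte column
// of the r^n table at x0. The 16-byte stride leaves the other three
// columns free for the remaining powers, matching the ld1 {...}.4s
// loads in the NEON code.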
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

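// poly1305_blocks_neon processes four blocks (64 bytes) per iteration:
// the hash is held as five 26-bit limbs so limb products fit in 64 bits
// with headroom for lazy carries, and the two 64-bit lanes of each
// vector register carry interleaved even/odd block streams, weighted
// with r^4..r^1 as sketched in the comments inside the loop and merged
// by the final horizontal add. Entry logic: short inputs (<128 bytes)
// with the hash still in base 2^64 are punted to the scalar
// poly1305_blocks; otherwise the hash is converted between base 2^64
// and base 2^26 as needed, a leading block that breaks 32-byte
// alignment of the length is handled by the scalar multiply, and the
// r^1..r^4 table is built on first use. When fewer than four blocks
// remain, x16 is redirected from the input to .Lzeros so the surplus
// lanes multiply in zeros.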
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

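// .Ldo_neon: split the next four blocks into two lane pairs. x16 walks
// blocks 2:3 (or the .Lzeros padding) and x1 walks blocks 0:1; each
// block is cut into five 26-bit pieces, and matching pieces of the two
// blocks in a pair are packed into one 64-bit GPR and moved to d14-d18
// (inp[2:3]) and d9-d13 (inp[0:1]) for the vector multiply. x3, the
// padbit, is pre-shifted left by 24 so it lands at bit 128 of each
// block, i.e. bit 24 of the fifth limb.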
.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

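// Main loop notes: v31, built just above as all-ones shifted right by
// 38, is the 0x03ffffff limb mask used by the lazy reduction, and b.ls
// consumes the flags from the earlier subs x2,x2,#64. Inside the loop
// the scalar base 2^26 conversion of the next four blocks is
// interleaved with the vector multiply-accumulate of the current ones,
// which helps hide the GPR-to-SIMD move latency.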
.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

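// Tail: at most 64 bytes remain. v14-v18 hold either inp[2:3] (64-byte
// tail) or hash+inp[0:1] (32-byte tail); dup copies their lane pair
// into the upper register halves so umull2/umlal2 can weight them with
// the high table lanes, r^2:r^1. For the 64-byte tail, hash+inp[0:1]
// is additionally accumulated with plain umlal against the low lanes,
// r^4:r^3, before both lanes are merged by the horizontal add in
// .Lshort_tail.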
.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon
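// poly1305_emit_neon: if the hash was left in base 2^26 by the NEON
// code, convert it back to base 2^64 and fold the possibly partially
// reduced carry, then run the same final-subtraction-and-nonce sequence
// as poly1305_emit; otherwise defer to poly1305_emit directly.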

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon
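// Constant pool: .Lzeros supplies the zero padding loaded in place of
// input when fewer than four blocks remain; .LOPENSSL_armcap_P holds a
// PC-relative offset to the capability word (pointer-sized, hence the
// __ILP32__ split, and consumed by the adr/ldr pair in poly1305_init);
// the .byte string is the usual CRYPTOGAMS banner.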

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2