/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl  poly1305_blocks
.globl  poly1305_emit

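// poly1305_init (x0 = context, x1 = key, x2 = two-entry function table):
// zeroes the hash value (and the is_base2_26 flag), applies the standard
// Poly1305 clamp to the first 16 key bytes (in little-endian 64-bit halves,
// roughly r0 &= 0x0ffffffc0fffffff and r1 &= 0x0ffffffc0ffffffc), and,
// depending on the ARMV7_NEON bit in OPENSSL_armcap_P, stores pointers to
// either the scalar or the NEON blocks/emit routines at [x2].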
.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
        cmp     x1,xzr
        stp     xzr,xzr,[x0]            // zero hash value
        stp     xzr,xzr,[x0,#16]        // [along with is_base2_26]

        csel    x0,xzr,x0,eq
        b.eq    .Lno_key

#ifdef  __ILP32__
        ldrsw   x11,.LOPENSSL_armcap_P
#else
        ldr     x11,.LOPENSSL_armcap_P
#endif
        adr     x10,.LOPENSSL_armcap_P

        ldp     x7,x8,[x1]              // load key
        mov     x9,#0xfffffffc0fffffff
        movk    x9,#0x0fff,lsl#48
        ldr     w17,[x10,x11]
#ifdef  __ARMEB__
        rev     x7,x7                   // flip bytes
        rev     x8,x8
#endif
        and     x7,x7,x9                // &=0ffffffc0fffffff
        and     x9,x9,#-4
        and     x8,x8,x9                // &=0ffffffc0ffffffc
        stp     x7,x8,[x0,#32]  // save key value

        tst     w17,#ARMV7_NEON

        adr     x12,poly1305_blocks
        adr     x7,poly1305_blocks_neon
        adr     x13,poly1305_emit
        adr     x8,poly1305_emit_neon

        csel    x12,x12,x7,eq
        csel    x13,x13,x8,eq

#ifdef  __ILP32__
        stp     w12,w13,[x2]
#else
        stp     x12,x13,[x2]
#endif

        mov     x0,#1
.Lno_key:
        ret
.size   poly1305_init,.-poly1305_init

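// poly1305_blocks (x0 = context, x1 = input, x2 = length, x3 = pad bit):
// scalar path.  For each 16-byte block the accumulator (x4:x5:x6, base 2^64
// with the excess bits in x6) is updated as
// h = (h + block + padbit*2^128) * r mod 2^130-5.  The precomputed
// s1 = r1 + (r1>>2) equals 5*r1/4 (exact, since clamping makes r1 a multiple
// of 4) and lets partial products that would land at weight 2^128 and above
// be folded back using 2^128 = 5/4 (mod 2^130-5).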
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
        ands    x2,x2,#-16
        b.eq    .Lno_data

        ldp     x4,x5,[x0]              // load hash value
        ldp     x7,x8,[x0,#32]  // load key value
        ldr     x6,[x0,#16]
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        b       .Loop

.align  5
.Loop:
        ldp     x10,x11,[x1],#16        // load input
        sub     x2,x2,#16
#ifdef  __ARMEB__
        rev     x10,x10
        rev     x11,x11
#endif
        adds    x4,x4,x10               // accumulate input
        adcs    x5,x5,x11

        mul     x12,x4,x7               // h0*r0
        adc     x6,x6,x3
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

        cbnz    x2,.Loop

        stp     x4,x5,[x0]              // store hash value
        str     x6,[x0,#16]

.Lno_data:
        ret
.size   poly1305_blocks,.-poly1305_blocks

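// poly1305_emit (x0 = context, x1 = 16-byte tag output, x2 = nonce):
// scalar finalization.  h+5 is computed and, if the sum reaches 2^130
// (detected via the top word), it replaces h, which is equivalent to
// subtracting the modulus 2^130-5; the low 128 bits plus the nonce are then
// written out little-endian.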
.type   poly1305_emit,%function
.align  5
poly1305_emit:
        ldp     x4,x5,[x0]              // load hash base 2^64
        ldr     x6,[x0,#16]
        ldp     x10,x11,[x2]    // load nonce

        adds    x12,x4,#5               // compare to modulus
        adcs    x13,x5,xzr
        adc     x14,x6,xzr

        tst     x14,#-4                 // see if it's carried/borrowed

        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq

#ifdef  __ARMEB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
#endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
#ifdef  __ARMEB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
#endif
        stp     x4,x5,[x1]              // write result

        ret
.size   poly1305_emit,.-poly1305_emit
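
// poly1305_mult: one h = h*r mod 2^130-5 step on the scalar registers,
// the same sequence as the poly1305_blocks inner loop minus the input
// accumulate and pad-bit add; used below while computing the r^2..r^4
// table for the NEON path.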
.type   poly1305_mult,%function
.align  5
poly1305_mult:
        mul     x12,x4,x7               // h0*r0
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

        ret
.size   poly1305_mult,.-poly1305_mult

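// poly1305_splat: splits the 130-bit value in x4:x5:x6 into five 26-bit
// limbs and stores each limb r_i together with 5*r_i ("s_i") into the
// vector key table.  Entries are 16 bytes apart: each 16-byte row is one
// .4s vector holding that limb for the four powers of r, and the caller
// steps x0 back by 4 between calls to fill successive lanes.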
.type   poly1305_splat,%function
.align  5
poly1305_splat:
        and     x12,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x13,x4,#26,#26
        extr    x14,x5,x4,#52
        and     x14,x14,#0x03ffffff
        ubfx    x15,x5,#14,#26
        extr    x16,x6,x5,#40

        str     w12,[x0,#16*0]  // r0
        add     w12,w13,w13,lsl#2       // r1*5
        str     w13,[x0,#16*1]  // r1
        add     w13,w14,w14,lsl#2       // r2*5
        str     w12,[x0,#16*2]  // s1
        str     w14,[x0,#16*3]  // r2
        add     w14,w15,w15,lsl#2       // r3*5
        str     w13,[x0,#16*4]  // s2
        str     w15,[x0,#16*5]  // r3
        add     w15,w16,w16,lsl#2       // r4*5
        str     w14,[x0,#16*6]  // s3
        str     w16,[x0,#16*7]  // r4
        str     w15,[x0,#16*8]  // s4

        ret
.size   poly1305_splat,.-poly1305_splat

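// poly1305_blocks_neon: vector path.  Short inputs whose state is still in
// base 2^64 are handed back to the scalar poly1305_blocks.  Otherwise the
// hash is kept as five 26-bit limbs, message blocks are processed two at a
// time per vector register, and each .Loop_neon iteration absorbs 64 bytes
// of input using the precomputed powers r^1..r^4 (schedule sketched in the
// comments inside the loop).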
.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
        ldr     x17,[x0,#24]
        cmp     x2,#128
        b.hs    .Lblocks_neon
        cbz     x17,poly1305_blocks

.Lblocks_neon:
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0

        ands    x2,x2,#-16
        b.eq    .Lno_data_neon

        cbz     x17,.Lbase2_64_neon

        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
        ldr     w14,[x0,#16]

        tst     x2,#31
        b.eq    .Leven_neon

        ldp     x7,x8,[x0,#32]  // load key value

        add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
        lsr     x5,x12,#12
        adds    x4,x4,x12,lsl#52
        add     x5,x5,x13,lsl#14
        adc     x5,x5,xzr
        lsr     x6,x14,#24
        adds    x5,x5,x14,lsl#40
        adc     x14,x6,xzr              // can be partially reduced...

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)

        and     x10,x14,#-4             // ... so reduce
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x4,x10
        adcs    x5,x5,xzr
        adc     x6,x6,xzr

#ifdef  __ARMEB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult
        ldr     x30,[sp,#8]

        cbz     x3,.Lstore_base2_64_neon

        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        cbnz    x2,.Leven_neon

        stp     w10,w11,[x0]            // store hash value base 2^26
        stp     w12,w13,[x0,#8]
        str     w14,[x0,#16]
        b       .Lno_data_neon

.align  4
.Lstore_base2_64_neon:
        stp     x4,x5,[x0]              // store hash value base 2^64
        stp     x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
        b       .Lno_data_neon

.align  4
.Lbase2_64_neon:
        ldp     x7,x8,[x0,#32]  // load key value

        ldp     x4,x5,[x0]              // load hash value base 2^64
        ldr     x6,[x0,#16]

        tst     x2,#31
        b.eq    .Linit_neon

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
#ifdef  __ARMEB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult

.Linit_neon:
        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        stp     d8,d9,[sp,#16]          // meet ABI requirements
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

        ////////////////////////////////// initialize r^n table
        mov     x4,x7                   // r^1
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        mov     x5,x8
        mov     x6,xzr
        add     x0,x0,#48+12
        bl      poly1305_splat

        bl      poly1305_mult           // r^2
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^3
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^4
        sub     x0,x0,#4
        bl      poly1305_splat
        ldr     x30,[sp,#8]

        add     x16,x1,#32
        adr     x17,.Lzeros
        subs    x2,x2,#64
        csel    x16,x17,x16,lo

        mov     x4,#1
        str     x4,[x0,#-24]            // set is_base2_26
        sub     x0,x0,#48               // restore original x0
        b       .Ldo_neon

.align  4
.Leven_neon:
        add     x16,x1,#32
        adr     x17,.Lzeros
        subs    x2,x2,#64
        csel    x16,x17,x16,lo

        stp     d8,d9,[sp,#16]          // meet ABI requirements
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

.Ldo_neon:
        ldp     x8,x12,[x16],#16        // inp[2:3] (or zero)
        ldp     x9,x13,[x16],#48

        lsl     x3,x3,#24
        add     x15,x0,#48

#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d14,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d15,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        fmov    d16,x8
        fmov    d17,x10
        fmov    d18,x12

        ldp     x8,x12,[x1],#16 // inp[0:1]
        ldp     x9,x13,[x1],#48

        ld1     {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
        ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
        ld1     {v8.4s},[x15]

#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d9,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d10,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        movi    v31.2d,#-1
        fmov    d11,x8
        fmov    d12,x10
        fmov    d13,x12
        ushr    v31.2d,v31.2d,#38

        b.ls    .Lskip_loop

.align  4
.Loop_neon:
        ////////////////////////////////////////////////////////////////
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        //   ___________________/
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        //   ___________________/ ____________________/
        //
        // Note that we start with inp[2:3]*r^2. This is because it
        // doesn't depend on reduction in previous iteration.
        ////////////////////////////////////////////////////////////////
        // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
        // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
        // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
        // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
        // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
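        //
        // The 5*r_i ("s_i") factors come from the same identity as the
        // scalar code: any limb product landing at weight 2^130 or higher
        // is folded back down using 2^130 = 5 (mod 2^130-5).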

        subs    x2,x2,#64
        umull   v23.2d,v14.2s,v7.s[2]
        csel    x16,x17,x16,lo
        umull   v22.2d,v14.2s,v5.s[2]
        umull   v21.2d,v14.2s,v3.s[2]
        ldp     x8,x12,[x16],#16        // inp[2:3] (or zero)
        umull   v20.2d,v14.2s,v1.s[2]
        ldp     x9,x13,[x16],#48
        umull   v19.2d,v14.2s,v0.s[2]
#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif

        umlal   v23.2d,v15.2s,v5.s[2]
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v22.2d,v15.2s,v3.s[2]
        and     x5,x9,#0x03ffffff
        umlal   v21.2d,v15.2s,v1.s[2]
        ubfx    x6,x8,#26,#26
        umlal   v20.2d,v15.2s,v0.s[2]
        ubfx    x7,x9,#26,#26
        umlal   v19.2d,v15.2s,v8.s[2]
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32

        umlal   v23.2d,v16.2s,v3.s[2]
        extr    x8,x12,x8,#52
        umlal   v22.2d,v16.2s,v1.s[2]
        extr    x9,x13,x9,#52
        umlal   v21.2d,v16.2s,v0.s[2]
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v20.2d,v16.2s,v8.s[2]
        fmov    d14,x4
        umlal   v19.2d,v16.2s,v6.s[2]
        and     x8,x8,#0x03ffffff

        umlal   v23.2d,v17.2s,v1.s[2]
        and     x9,x9,#0x03ffffff
        umlal   v22.2d,v17.2s,v0.s[2]
        ubfx    x10,x12,#14,#26
        umlal   v21.2d,v17.2s,v8.s[2]
        ubfx    x11,x13,#14,#26
        umlal   v20.2d,v17.2s,v6.s[2]
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v19.2d,v17.2s,v4.s[2]
        fmov    d15,x6

        add     v11.2s,v11.2s,v26.2s
        add     x12,x3,x12,lsr#40
        umlal   v23.2d,v18.2s,v0.s[2]
        add     x13,x3,x13,lsr#40
        umlal   v22.2d,v18.2s,v8.s[2]
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v21.2d,v18.2s,v6.s[2]
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v18.2s,v4.s[2]
        fmov    d16,x8
        umlal   v19.2d,v18.2s,v2.s[2]
        fmov    d17,x10

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4 and accumulate

        add     v9.2s,v9.2s,v24.2s
        fmov    d18,x12
        umlal   v22.2d,v11.2s,v1.s[0]
        ldp     x8,x12,[x1],#16 // inp[0:1]
        umlal   v19.2d,v11.2s,v6.s[0]
        ldp     x9,x13,[x1],#48
        umlal   v23.2d,v11.2s,v3.s[0]
        umlal   v20.2d,v11.2s,v8.s[0]
        umlal   v21.2d,v11.2s,v0.s[0]
#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.s[0]
        umlal   v23.2d,v9.2s,v7.s[0]
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v21.2d,v9.2s,v3.s[0]
        and     x5,x9,#0x03ffffff
        umlal   v19.2d,v9.2s,v0.s[0]
        ubfx    x6,x8,#26,#26
        umlal   v20.2d,v9.2s,v1.s[0]
        ubfx    x7,x9,#26,#26

        add     v12.2s,v12.2s,v27.2s
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        umlal   v22.2d,v10.2s,v3.s[0]
        extr    x8,x12,x8,#52
        umlal   v23.2d,v10.2s,v5.s[0]
        extr    x9,x13,x9,#52
        umlal   v19.2d,v10.2s,v8.s[0]
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v21.2d,v10.2s,v1.s[0]
        fmov    d9,x4
        umlal   v20.2d,v10.2s,v0.s[0]
        and     x8,x8,#0x03ffffff

        add     v13.2s,v13.2s,v28.2s
        and     x9,x9,#0x03ffffff
        umlal   v22.2d,v12.2s,v0.s[0]
        ubfx    x10,x12,#14,#26
        umlal   v19.2d,v12.2s,v4.s[0]
        ubfx    x11,x13,#14,#26
        umlal   v23.2d,v12.2s,v1.s[0]
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v20.2d,v12.2s,v6.s[0]
        fmov    d10,x6
        umlal   v21.2d,v12.2s,v8.s[0]
        add     x12,x3,x12,lsr#40

        umlal   v22.2d,v13.2s,v8.s[0]
        add     x13,x3,x13,lsr#40
        umlal   v19.2d,v13.2s,v2.s[0]
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v23.2d,v13.2s,v0.s[0]
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v13.2s,v4.s[0]
        fmov    d11,x8
        umlal   v21.2d,v13.2s,v6.s[0]
        fmov    d12,x10
        fmov    d13,x12

        /////////////////////////////////////////////////////////////////
        // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        // and P. Schwabe
        //
        // [see discussion in poly1305-armv4 module]
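        //
        // Roughly: each 64-bit lane holds a limb that may have grown past
        // 26 bits; carries are propagated h3->h4, h0->h1, h4->h0 (times 5,
        // via the extra shl #2 and add), h1->h2 and h2->h3, which keeps
        // every limb bounded without a full reduction per iteration.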

        ushr    v29.2d,v22.2d,#26
        xtn     v27.2s,v22.2d
        ushr    v30.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b
        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
        bic     v27.2s,#0xfc,lsl#24     // &=0x03ffffff
        add     v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        xtn     v28.2s,v23.2d
        ushr    v30.2d,v20.2d,#26
        xtn     v25.2s,v20.2d
        bic     v28.2s,#0xfc,lsl#24
        add     v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
        shrn    v30.2s,v21.2d,#26
        xtn     v26.2s,v21.2d
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
        bic     v25.2s,#0xfc,lsl#24
        add     v27.2s,v27.2s,v30.2s            // h2 -> h3
        bic     v26.2s,#0xfc,lsl#24

        shrn    v29.2s,v19.2d,#26
        xtn     v24.2s,v19.2d
        ushr    v30.2s,v27.2s,#26
        bic     v27.2s,#0xfc,lsl#24
        bic     v24.2s,#0xfc,lsl#24
        add     v25.2s,v25.2s,v29.2s            // h0 -> h1
        add     v28.2s,v28.2s,v30.2s            // h3 -> h4

        b.hi    .Loop_neon

.Lskip_loop:
        dup     v16.2d,v16.d[0]
        add     v11.2s,v11.2s,v26.2s

        ////////////////////////////////////////////////////////////////
        // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        adds    x2,x2,#32
        b.ne    .Long_tail

        dup     v16.2d,v11.d[0]
        add     v14.2s,v9.2s,v24.2s
        add     v17.2s,v12.2s,v27.2s
        add     v15.2s,v10.2s,v25.2s
        add     v18.2s,v13.2s,v28.2s

.Long_tail:
        dup     v14.2d,v14.d[0]
        umull2  v19.2d,v16.4s,v6.4s
        umull2  v22.2d,v16.4s,v1.4s
        umull2  v23.2d,v16.4s,v3.4s
        umull2  v21.2d,v16.4s,v0.4s
        umull2  v20.2d,v16.4s,v8.4s

        dup     v15.2d,v15.d[0]
        umlal2  v19.2d,v14.4s,v0.4s
        umlal2  v21.2d,v14.4s,v3.4s
        umlal2  v22.2d,v14.4s,v5.4s
        umlal2  v23.2d,v14.4s,v7.4s
        umlal2  v20.2d,v14.4s,v1.4s

        dup     v17.2d,v17.d[0]
        umlal2  v19.2d,v15.4s,v8.4s
        umlal2  v22.2d,v15.4s,v3.4s
        umlal2  v21.2d,v15.4s,v1.4s
        umlal2  v23.2d,v15.4s,v5.4s
        umlal2  v20.2d,v15.4s,v0.4s

        dup     v18.2d,v18.d[0]
        umlal2  v22.2d,v17.4s,v0.4s
        umlal2  v23.2d,v17.4s,v1.4s
        umlal2  v19.2d,v17.4s,v4.4s
        umlal2  v20.2d,v17.4s,v6.4s
        umlal2  v21.2d,v17.4s,v8.4s

        umlal2  v22.2d,v18.4s,v8.4s
        umlal2  v19.2d,v18.4s,v2.4s
        umlal2  v23.2d,v18.4s,v0.4s
        umlal2  v20.2d,v18.4s,v4.4s
        umlal2  v21.2d,v18.4s,v6.4s

        b.eq    .Lshort_tail

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4:r^3 and accumulate

        add     v9.2s,v9.2s,v24.2s
        umlal   v22.2d,v11.2s,v1.2s
        umlal   v19.2d,v11.2s,v6.2s
        umlal   v23.2d,v11.2s,v3.2s
        umlal   v20.2d,v11.2s,v8.2s
        umlal   v21.2d,v11.2s,v0.2s

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.2s
        umlal   v19.2d,v9.2s,v0.2s
        umlal   v23.2d,v9.2s,v7.2s
        umlal   v20.2d,v9.2s,v1.2s
        umlal   v21.2d,v9.2s,v3.2s

        add     v12.2s,v12.2s,v27.2s
        umlal   v22.2d,v10.2s,v3.2s
        umlal   v19.2d,v10.2s,v8.2s
        umlal   v23.2d,v10.2s,v5.2s
        umlal   v20.2d,v10.2s,v0.2s
        umlal   v21.2d,v10.2s,v1.2s

        add     v13.2s,v13.2s,v28.2s
        umlal   v22.2d,v12.2s,v0.2s
        umlal   v19.2d,v12.2s,v4.2s
        umlal   v23.2d,v12.2s,v1.2s
        umlal   v20.2d,v12.2s,v6.2s
        umlal   v21.2d,v12.2s,v8.2s

        umlal   v22.2d,v13.2s,v8.2s
        umlal   v19.2d,v13.2s,v2.2s
        umlal   v23.2d,v13.2s,v0.2s
        umlal   v20.2d,v13.2s,v4.2s
        umlal   v21.2d,v13.2s,v6.2s

.Lshort_tail:
        ////////////////////////////////////////////////////////////////
        // horizontal add

        addp    v22.2d,v22.2d,v22.2d
        ldp     d8,d9,[sp,#16]          // meet ABI requirements
        addp    v19.2d,v19.2d,v19.2d
        ldp     d10,d11,[sp,#32]
        addp    v23.2d,v23.2d,v23.2d
        ldp     d12,d13,[sp,#48]
        addp    v20.2d,v20.2d,v20.2d
        ldp     d14,d15,[sp,#64]
        addp    v21.2d,v21.2d,v21.2d

        ////////////////////////////////////////////////////////////////
        // lazy reduction, but without narrowing

        ushr    v29.2d,v22.2d,#26
        and     v22.16b,v22.16b,v31.16b
        ushr    v30.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b

        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
        add     v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        and     v23.16b,v23.16b,v31.16b
        ushr    v30.2d,v20.2d,#26
        and     v20.16b,v20.16b,v31.16b
        add     v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
        ushr    v30.2d,v21.2d,#26
        and     v21.16b,v21.16b,v31.16b
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
        add     v22.2d,v22.2d,v30.2d    // h2 -> h3

        ushr    v29.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b
        ushr    v30.2d,v22.2d,#26
        and     v22.16b,v22.16b,v31.16b
        add     v20.2d,v20.2d,v29.2d    // h0 -> h1
        add     v23.2d,v23.2d,v30.2d    // h3 -> h4

        ////////////////////////////////////////////////////////////////
        // write the result, can be partially reduced

        st4     {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
        st1     {v23.s}[0],[x0]

.Lno_data_neon:
        ldr     x29,[sp],#80
        ret
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

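// poly1305_emit_neon: if the state was left in base 2^26 (is_base2_26 set),
// the five limbs are first recombined into base 2^64 and any excess above
// 2^130 is folded back (times 5) before the usual final reduction and nonce
// addition; otherwise it simply branches to the scalar poly1305_emit.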
.type   poly1305_emit_neon,%function
.align  5
poly1305_emit_neon:
        ldr     x17,[x0,#24]
        cbz     x17,poly1305_emit

        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
        ldr     w14,[x0,#16]

        add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
        lsr     x5,x12,#12
        adds    x4,x4,x12,lsl#52
        add     x5,x5,x13,lsl#14
        adc     x5,x5,xzr
        lsr     x6,x14,#24
        adds    x5,x5,x14,lsl#40
        adc     x6,x6,xzr               // can be partially reduced...

        ldp     x10,x11,[x2]    // load nonce

        and     x12,x6,#-4              // ... so reduce
        add     x12,x12,x6,lsr#2
        and     x6,x6,#3
        adds    x4,x4,x12
        adcs    x5,x5,xzr
        adc     x6,x6,xzr

        adds    x12,x4,#5               // compare to modulus
        adcs    x13,x5,xzr
        adc     x14,x6,xzr

        tst     x14,#-4                 // see if it's carried/borrowed

        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq

#ifdef  __ARMEB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
#endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
#ifdef  __ARMEB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
#endif
        stp     x4,x5,[x1]              // write result

        ret
.size   poly1305_emit_neon,.-poly1305_emit_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef  __ILP32__
.long   OPENSSL_armcap_P-.
#else
.quad   OPENSSL_armcap_P-.
#endif
.byte   80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2