/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

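// Poly1305 context layout, as implied by the loads and stores below
// (offsets relative to the context pointer passed in x0):
//   #0..#23   hash value h, either as three base-2^64 limbs or, while
//             the NEON path is active, as five base-2^26 limbs held in
//             32-bit words
//   #24       is_base2_26 flag, non-zero while h is in base-2^26 form
//   #32..#47  clamped key limbs r0,r1
//   #48..     table of r^1..r^4 (and 5*r_i) filled in by poly1305_splat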
.globl  poly1305_blocks
.globl  poly1305_emit

.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
        cmp     x1,xzr
        stp     xzr,xzr,[x0]            // zero hash value
        stp     xzr,xzr,[x0,#16]        // [along with is_base2_26]

        csel    x0,xzr,x0,eq
        b.eq    .Lno_key

#ifdef  __ILP32__
        ldrsw   x11,.LOPENSSL_armcap_P
#else
        ldr     x11,.LOPENSSL_armcap_P
#endif
        adr     x10,.LOPENSSL_armcap_P

        ldp     x7,x8,[x1]              // load key
        mov     x9,#0xfffffffc0fffffff
        movk    x9,#0x0fff,lsl#48
        ldr     w17,[x10,x11]
#ifdef  __ARMEB__
        rev     x7,x7                   // flip bytes
        rev     x8,x8
#endif
        and     x7,x7,x9                // &=0ffffffc0fffffff
        and     x9,x9,#-4
        and     x8,x8,x9                // &=0ffffffc0ffffffc
        stp     x7,x8,[x0,#32]  // save key value

        tst     w17,#ARMV7_NEON

        adr     x12,poly1305_blocks
        adr     x7,poly1305_blocks_neon
        adr     x13,poly1305_emit
        adr     x8,poly1305_emit_neon

        csel    x12,x12,x7,eq
        csel    x13,x13,x8,eq

#ifdef  __ILP32__
        stp     w12,w13,[x2]
#else
        stp     x12,x13,[x2]
#endif

        mov     x0,#1
.Lno_key:
        ret
.size   poly1305_init,.-poly1305_init

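// poly1305_blocks(ctx=x0, inp=x1, len=x2, padbit=x3) runs entirely on
// general-purpose registers: per 16-byte block,
// h = (h + block + padbit*2^128) * r mod 2^130-5, with h kept as limbs
// h0,h1,h2 in base 2^64. The s1 = r1 + (r1>>2) value computed below
// exploits the key clamping above: r1 is a multiple of 4, and
// 2^130 == 5 (mod 2^130-5), so a partial product with weight 2^128
// folds back in as (r1>>2)*5 = r1 + (r1>>2).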
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
        ands    x2,x2,#-16
        b.eq    .Lno_data

        ldp     x4,x5,[x0]              // load hash value
        ldp     x7,x8,[x0,#32]  // load key value
        ldr     x6,[x0,#16]
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        b       .Loop

.align  5
.Loop:
        ldp     x10,x11,[x1],#16        // load input
        sub     x2,x2,#16
#ifdef  __ARMEB__
        rev     x10,x10
        rev     x11,x11
#endif
        adds    x4,x4,x10               // accumulate input
        adcs    x5,x5,x11

        mul     x12,x4,x7               // h0*r0
        adc     x6,x6,x3                // .. and padbit
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

        cbnz    x2,.Loop

        stp     x4,x5,[x0]              // store hash value
        str     x6,[x0,#16]

.Lno_data:
        ret
.size   poly1305_blocks,.-poly1305_blocks

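// poly1305_emit(ctx=x0, mac=x1, nonce=x2) reduces h to its canonical
// value and adds the nonce modulo 2^128. One conditional subtraction
// of p = 2^130-5 suffices: h+5 carries into bit 130 exactly when
// h >= p, and in that case the low 128 bits of h+5 equal h-p.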
.type   poly1305_emit,%function
.align  5
poly1305_emit:
        ldp     x4,x5,[x0]              // load hash base 2^64
        ldr     x6,[x0,#16]
        ldp     x10,x11,[x2]    // load nonce

        adds    x12,x4,#5               // compare to modulus
        adcs    x13,x5,xzr
        adc     x14,x6,xzr

        tst     x14,#-4                 // see if it's carried/borrowed

        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq

#ifdef  __ARMEB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
#endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
#ifdef  __ARMEB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
#endif
        stp     x4,x5,[x1]              // write result

        ret
.size   poly1305_emit,.-poly1305_emit
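// poly1305_mult is the same multiply-and-reduce step as the body of
// .Loop above, factored out as a subroutine (h in x4-x6, r in x7-x8,
// s1 in x9) so the NEON entry code below can reuse it to compute the
// powers r^2, r^3 and r^4.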
.type   poly1305_mult,%function
.align  5
poly1305_mult:
        mul     x12,x4,x7               // h0*r0
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

        ret
.size   poly1305_mult,.-poly1305_mult

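// poly1305_splat splits the base-2^64 value in x4,x5,x6 into five
// 26-bit limbs and stores them, along with the precomputed 5*r_i
// values, as one 4-byte column of the power table at x0. The callers
// step x0 back by 4 between powers, so r^4,r^3,r^2,r^1 end up in
// lanes 0-3 of each 16-byte table row loaded by the NEON code.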
.type   poly1305_splat,%function
.align  5
poly1305_splat:
        and     x12,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x13,x4,#26,#26
        extr    x14,x5,x4,#52
        and     x14,x14,#0x03ffffff
        ubfx    x15,x5,#14,#26
        extr    x16,x6,x5,#40

        str     w12,[x0,#16*0]  // r0
        add     w12,w13,w13,lsl#2       // r1*5
        str     w13,[x0,#16*1]  // r1
        add     w13,w14,w14,lsl#2       // r2*5
        str     w12,[x0,#16*2]  // s1
        str     w14,[x0,#16*3]  // r2
        add     w14,w15,w15,lsl#2       // r3*5
        str     w13,[x0,#16*4]  // s2
        str     w15,[x0,#16*5]  // r3
        add     w15,w16,w16,lsl#2       // r4*5
        str     w14,[x0,#16*6]  // s3
        str     w16,[x0,#16*7]  // r4
        str     w15,[x0,#16*8]  // s4

        ret
.size   poly1305_splat,.-poly1305_splat

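// poly1305_blocks_neon keeps the hash in base 2^26 and consumes four
// blocks per iteration as two 2-way SIMD streams, following the
// schedule spelled out in the comment at the top of .Loop_neon.
// Inputs shorter than 128 bytes are handed to the scalar
// poly1305_blocks unless the hash is already in base-2^26 form; x16
// is switched to .Lzeros once fewer than 64 bytes remain, so the tail
// reads zeros instead of input. The .inst words below encode
// paciasp/autiasp for assemblers that predate ARMv8.3 pointer
// authentication.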
.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
        ldr     x17,[x0,#24]
        cmp     x2,#128
        b.hs    .Lblocks_neon
        cbz     x17,poly1305_blocks

.Lblocks_neon:
.inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0

        ands    x2,x2,#-16
        b.eq    .Lno_data_neon

        cbz     x17,.Lbase2_64_neon

        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
        ldr     w14,[x0,#16]

        tst     x2,#31
        b.eq    .Leven_neon

        ldp     x7,x8,[x0,#32]  // load key value

        add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
        lsr     x5,x12,#12
        adds    x4,x4,x12,lsl#52
        add     x5,x5,x13,lsl#14
        adc     x5,x5,xzr
        lsr     x6,x14,#24
        adds    x5,x5,x14,lsl#40
        adc     x14,x6,xzr              // can be partially reduced...

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)

        and     x10,x14,#-4             // ... so reduce
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x4,x10
        adcs    x5,x5,xzr
        adc     x6,x6,xzr

#ifdef  __ARMEB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult
        ldr     x30,[sp,#8]

        cbz     x3,.Lstore_base2_64_neon

        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        cbnz    x2,.Leven_neon

        stp     w10,w11,[x0]            // store hash value base 2^26
        stp     w12,w13,[x0,#8]
        str     w14,[x0,#16]
        b       .Lno_data_neon

.align  4
.Lstore_base2_64_neon:
        stp     x4,x5,[x0]              // store hash value base 2^64
        stp     x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
        b       .Lno_data_neon

.align  4
.Lbase2_64_neon:
        ldp     x7,x8,[x0,#32]  // load key value

        ldp     x4,x5,[x0]              // load hash value base 2^64
        ldr     x6,[x0,#16]

        tst     x2,#31
        b.eq    .Linit_neon

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
#ifdef  __ARMEB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult

.Linit_neon:
        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        stp     d8,d9,[sp,#16]          // meet ABI requirements
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

        ////////////////////////////////// initialize r^n table
        mov     x4,x7                   // r^1
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        mov     x5,x8
        mov     x6,xzr
        add     x0,x0,#48+12
        bl      poly1305_splat

        bl      poly1305_mult           // r^2
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^3
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^4
        sub     x0,x0,#4
        bl      poly1305_splat
        ldr     x30,[sp,#8]

        add     x16,x1,#32
        adr     x17,.Lzeros
        subs    x2,x2,#64
        csel    x16,x17,x16,lo

        mov     x4,#1
        str     x4,[x0,#-24]            // set is_base2_26
        sub     x0,x0,#48               // restore original x0
        b       .Ldo_neon

.align  4
.Leven_neon:
        add     x16,x1,#32
        adr     x17,.Lzeros
        subs    x2,x2,#64
        csel    x16,x17,x16,lo

        stp     d8,d9,[sp,#16]          // meet ABI requirements
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

.Ldo_neon:
        ldp     x8,x12,[x16],#16        // inp[2:3] (or zero)
        ldp     x9,x13,[x16],#48

        lsl     x3,x3,#24               // padbit -> bit 24 of top base-2^26 limb
        add     x15,x0,#48

#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d14,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d15,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        fmov    d16,x8
        fmov    d17,x10
        fmov    d18,x12

        ldp     x8,x12,[x1],#16 // inp[0:1]
        ldp     x9,x13,[x1],#48

        ld1     {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
        ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
        ld1     {v8.4s},[x15]

#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d9,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d10,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        movi    v31.2d,#-1
        fmov    d11,x8
        fmov    d12,x10
        fmov    d13,x12
        ushr    v31.2d,v31.2d,#38
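        // v31 = 2^26-1 in both doublewords (all-ones shifted right
        // by 38), i.e. the limb mask used by the lazy reductions below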

        b.ls    .Lskip_loop

.align  4
.Loop_neon:
        ////////////////////////////////////////////////////////////////
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        //   ___________________/
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        //   ___________________/ ____________________/
        //
        // Note that we start with inp[2:3]*r^2, because it
        // doesn't depend on the reduction in the previous iteration.
        ////////////////////////////////////////////////////////////////
        // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
        // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
        // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
        // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
        // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

        subs    x2,x2,#64
        umull   v23.2d,v14.2s,v7.s[2]
        csel    x16,x17,x16,lo
        umull   v22.2d,v14.2s,v5.s[2]
        umull   v21.2d,v14.2s,v3.s[2]
        ldp     x8,x12,[x16],#16        // inp[2:3] (or zero)
        umull   v20.2d,v14.2s,v1.s[2]
        ldp     x9,x13,[x16],#48
        umull   v19.2d,v14.2s,v0.s[2]
#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif

        umlal   v23.2d,v15.2s,v5.s[2]
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v22.2d,v15.2s,v3.s[2]
        and     x5,x9,#0x03ffffff
        umlal   v21.2d,v15.2s,v1.s[2]
        ubfx    x6,x8,#26,#26
        umlal   v20.2d,v15.2s,v0.s[2]
        ubfx    x7,x9,#26,#26
        umlal   v19.2d,v15.2s,v8.s[2]
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32

        umlal   v23.2d,v16.2s,v3.s[2]
        extr    x8,x12,x8,#52
        umlal   v22.2d,v16.2s,v1.s[2]
        extr    x9,x13,x9,#52
        umlal   v21.2d,v16.2s,v0.s[2]
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v20.2d,v16.2s,v8.s[2]
        fmov    d14,x4
        umlal   v19.2d,v16.2s,v6.s[2]
        and     x8,x8,#0x03ffffff

        umlal   v23.2d,v17.2s,v1.s[2]
        and     x9,x9,#0x03ffffff
        umlal   v22.2d,v17.2s,v0.s[2]
        ubfx    x10,x12,#14,#26
        umlal   v21.2d,v17.2s,v8.s[2]
        ubfx    x11,x13,#14,#26
        umlal   v20.2d,v17.2s,v6.s[2]
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v19.2d,v17.2s,v4.s[2]
        fmov    d15,x6

        add     v11.2s,v11.2s,v26.2s
        add     x12,x3,x12,lsr#40
        umlal   v23.2d,v18.2s,v0.s[2]
        add     x13,x3,x13,lsr#40
        umlal   v22.2d,v18.2s,v8.s[2]
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v21.2d,v18.2s,v6.s[2]
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v18.2s,v4.s[2]
        fmov    d16,x8
        umlal   v19.2d,v18.2s,v2.s[2]
        fmov    d17,x10

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4 and accumulate

        add     v9.2s,v9.2s,v24.2s
        fmov    d18,x12
        umlal   v22.2d,v11.2s,v1.s[0]
        ldp     x8,x12,[x1],#16 // inp[0:1]
        umlal   v19.2d,v11.2s,v6.s[0]
        ldp     x9,x13,[x1],#48
        umlal   v23.2d,v11.2s,v3.s[0]
        umlal   v20.2d,v11.2s,v8.s[0]
        umlal   v21.2d,v11.2s,v0.s[0]
#ifdef  __ARMEB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.s[0]
        umlal   v23.2d,v9.2s,v7.s[0]
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v21.2d,v9.2s,v3.s[0]
        and     x5,x9,#0x03ffffff
        umlal   v19.2d,v9.2s,v0.s[0]
        ubfx    x6,x8,#26,#26
        umlal   v20.2d,v9.2s,v1.s[0]
        ubfx    x7,x9,#26,#26

        add     v12.2s,v12.2s,v27.2s
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        umlal   v22.2d,v10.2s,v3.s[0]
        extr    x8,x12,x8,#52
        umlal   v23.2d,v10.2s,v5.s[0]
        extr    x9,x13,x9,#52
        umlal   v19.2d,v10.2s,v8.s[0]
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v21.2d,v10.2s,v1.s[0]
        fmov    d9,x4
        umlal   v20.2d,v10.2s,v0.s[0]
        and     x8,x8,#0x03ffffff

        add     v13.2s,v13.2s,v28.2s
        and     x9,x9,#0x03ffffff
        umlal   v22.2d,v12.2s,v0.s[0]
        ubfx    x10,x12,#14,#26
        umlal   v19.2d,v12.2s,v4.s[0]
        ubfx    x11,x13,#14,#26
        umlal   v23.2d,v12.2s,v1.s[0]
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v20.2d,v12.2s,v6.s[0]
        fmov    d10,x6
        umlal   v21.2d,v12.2s,v8.s[0]
        add     x12,x3,x12,lsr#40

        umlal   v22.2d,v13.2s,v8.s[0]
        add     x13,x3,x13,lsr#40
        umlal   v19.2d,v13.2s,v2.s[0]
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v23.2d,v13.2s,v0.s[0]
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v13.2s,v4.s[0]
        fmov    d11,x8
        umlal   v21.2d,v13.2s,v6.s[0]
        fmov    d12,x10
        fmov    d13,x12

        /////////////////////////////////////////////////////////////////
        // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        // and P. Schwabe
        //
        // [see discussion in poly1305-armv4 module]

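        // the carry chain below resolves h3->h4 and h0->h1 first, then
        // h4's carry folds back into h0 multiplied by 5 (one add plus
        // a shl #2 add), so each limb absorbs at most one extra carry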
        ushr    v29.2d,v22.2d,#26
        xtn     v27.2s,v22.2d
        ushr    v30.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b
        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
        bic     v27.2s,#0xfc,lsl#24     // &=0x03ffffff
        add     v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        xtn     v28.2s,v23.2d
        ushr    v30.2d,v20.2d,#26
        xtn     v25.2s,v20.2d
        bic     v28.2s,#0xfc,lsl#24
        add     v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
        shrn    v30.2s,v21.2d,#26
        xtn     v26.2s,v21.2d
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
        bic     v25.2s,#0xfc,lsl#24
        add     v27.2s,v27.2s,v30.2s            // h2 -> h3
        bic     v26.2s,#0xfc,lsl#24

        shrn    v29.2s,v19.2d,#26
        xtn     v24.2s,v19.2d
        ushr    v30.2s,v27.2s,#26
        bic     v27.2s,#0xfc,lsl#24
        bic     v24.2s,#0xfc,lsl#24
        add     v25.2s,v25.2s,v29.2s            // h0 -> h1
        add     v28.2s,v28.2s,v30.2s            // h3 -> h4

        b.hi    .Loop_neon

.Lskip_loop:
        dup     v16.2d,v16.d[0]
        add     v11.2s,v11.2s,v26.2s

        ////////////////////////////////////////////////////////////////
        // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        adds    x2,x2,#32
        b.ne    .Long_tail

        dup     v16.2d,v11.d[0]
        add     v14.2s,v9.2s,v24.2s
        add     v17.2s,v12.2s,v27.2s
        add     v15.2s,v10.2s,v25.2s
        add     v18.2s,v13.2s,v28.2s

.Long_tail:
        dup     v14.2d,v14.d[0]
        umull2  v19.2d,v16.4s,v6.4s
        umull2  v22.2d,v16.4s,v1.4s
        umull2  v23.2d,v16.4s,v3.4s
        umull2  v21.2d,v16.4s,v0.4s
        umull2  v20.2d,v16.4s,v8.4s

        dup     v15.2d,v15.d[0]
        umlal2  v19.2d,v14.4s,v0.4s
        umlal2  v21.2d,v14.4s,v3.4s
        umlal2  v22.2d,v14.4s,v5.4s
        umlal2  v23.2d,v14.4s,v7.4s
        umlal2  v20.2d,v14.4s,v1.4s

        dup     v17.2d,v17.d[0]
        umlal2  v19.2d,v15.4s,v8.4s
        umlal2  v22.2d,v15.4s,v3.4s
        umlal2  v21.2d,v15.4s,v1.4s
        umlal2  v23.2d,v15.4s,v5.4s
        umlal2  v20.2d,v15.4s,v0.4s

        dup     v18.2d,v18.d[0]
        umlal2  v22.2d,v17.4s,v0.4s
        umlal2  v23.2d,v17.4s,v1.4s
        umlal2  v19.2d,v17.4s,v4.4s
        umlal2  v20.2d,v17.4s,v6.4s
        umlal2  v21.2d,v17.4s,v8.4s

        umlal2  v22.2d,v18.4s,v8.4s
        umlal2  v19.2d,v18.4s,v2.4s
        umlal2  v23.2d,v18.4s,v0.4s
        umlal2  v20.2d,v18.4s,v4.4s
        umlal2  v21.2d,v18.4s,v6.4s

        b.eq    .Lshort_tail

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4:r^3 and accumulate

        add     v9.2s,v9.2s,v24.2s
        umlal   v22.2d,v11.2s,v1.2s
        umlal   v19.2d,v11.2s,v6.2s
        umlal   v23.2d,v11.2s,v3.2s
        umlal   v20.2d,v11.2s,v8.2s
        umlal   v21.2d,v11.2s,v0.2s

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.2s
        umlal   v19.2d,v9.2s,v0.2s
        umlal   v23.2d,v9.2s,v7.2s
        umlal   v20.2d,v9.2s,v1.2s
        umlal   v21.2d,v9.2s,v3.2s

        add     v12.2s,v12.2s,v27.2s
        umlal   v22.2d,v10.2s,v3.2s
        umlal   v19.2d,v10.2s,v8.2s
        umlal   v23.2d,v10.2s,v5.2s
        umlal   v20.2d,v10.2s,v0.2s
        umlal   v21.2d,v10.2s,v1.2s

        add     v13.2s,v13.2s,v28.2s
        umlal   v22.2d,v12.2s,v0.2s
        umlal   v19.2d,v12.2s,v4.2s
        umlal   v23.2d,v12.2s,v1.2s
        umlal   v20.2d,v12.2s,v6.2s
        umlal   v21.2d,v12.2s,v8.2s

        umlal   v22.2d,v13.2s,v8.2s
        umlal   v19.2d,v13.2s,v2.2s
        umlal   v23.2d,v13.2s,v0.2s
        umlal   v20.2d,v13.2s,v4.2s
        umlal   v21.2d,v13.2s,v6.2s

.Lshort_tail:
        ////////////////////////////////////////////////////////////////
        // horizontal add

        addp    v22.2d,v22.2d,v22.2d
        ldp     d8,d9,[sp,#16]          // meet ABI requirements
        addp    v19.2d,v19.2d,v19.2d
        ldp     d10,d11,[sp,#32]
        addp    v23.2d,v23.2d,v23.2d
        ldp     d12,d13,[sp,#48]
        addp    v20.2d,v20.2d,v20.2d
        ldp     d14,d15,[sp,#64]
        addp    v21.2d,v21.2d,v21.2d

        ////////////////////////////////////////////////////////////////
        // lazy reduction, but without narrowing

        ushr    v29.2d,v22.2d,#26
        and     v22.16b,v22.16b,v31.16b
        ushr    v30.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b

        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
        add     v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        and     v23.16b,v23.16b,v31.16b
        ushr    v30.2d,v20.2d,#26
        and     v20.16b,v20.16b,v31.16b
        add     v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
        ushr    v30.2d,v21.2d,#26
        and     v21.16b,v21.16b,v31.16b
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
        add     v22.2d,v22.2d,v30.2d    // h2 -> h3

        ushr    v29.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b
        ushr    v30.2d,v22.2d,#26
        and     v22.16b,v22.16b,v31.16b
        add     v20.2d,v20.2d,v29.2d    // h0 -> h1
        add     v23.2d,v23.2d,v30.2d    // h3 -> h4

        ////////////////////////////////////////////////////////////////
        // write the result, can be partially reduced

        st4     {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
        st1     {v23.s}[0],[x0]

.Lno_data_neon:
.inst   0xd50323bf              // autiasp
        ldr     x29,[sp],#80
        ret
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

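// poly1305_emit_neon: if the hash was left in base-2^26 form by the
// NEON path, reassemble it into base 2^64, fold the (possibly
// unresolved) excess above bit 130 back in multiplied by 5, and then
// finish exactly like poly1305_emit; otherwise defer to
// poly1305_emit directly.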
.type   poly1305_emit_neon,%function
.align  5
poly1305_emit_neon:
        ldr     x17,[x0,#24]
        cbz     x17,poly1305_emit

        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
        ldr     w14,[x0,#16]

        add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
        lsr     x5,x12,#12
        adds    x4,x4,x12,lsl#52
        add     x5,x5,x13,lsl#14
        adc     x5,x5,xzr
        lsr     x6,x14,#24
        adds    x5,x5,x14,lsl#40
        adc     x6,x6,xzr               // can be partially reduced...

        ldp     x10,x11,[x2]    // load nonce

        and     x12,x6,#-4              // ... so reduce
        add     x12,x12,x6,lsr#2
        and     x6,x6,#3
        adds    x4,x4,x12
        adcs    x5,x5,xzr
        adc     x6,x6,xzr

        adds    x12,x4,#5               // compare to modulus
        adcs    x13,x5,xzr
        adc     x14,x6,xzr

        tst     x14,#-4                 // see if it's carried/borrowed

        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq

#ifdef  __ARMEB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
#endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
#ifdef  __ARMEB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
#endif
        stp     x4,x5,[x1]              // write result

        ret
.size   poly1305_emit_neon,.-poly1305_emit_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef  __ILP32__
.long   OPENSSL_armcap_P-.
#else
.quad   OPENSSL_armcap_P-.
#endif
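// .LOPENSSL_armcap_P stores the PC-relative offset of the runtime CPU
// capability word; poly1305_init reads it through the adr/ldr pair at
// the top of the file, reaching OPENSSL_armcap_P position-independently
// without a text relocation.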
.byte   80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align  2
.align  2