]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/crypto/openssl/aarch64/chacha-armv8.S
THIS BRANCH IS OBSOLETE, PLEASE READ:
[FreeBSD/FreeBSD.git] / sys / crypto / openssl / aarch64 / chacha-armv8.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
3 #include "arm_arch.h"
4
5 .text
6
7
8 .hidden OPENSSL_armcap_P
9
10 .align  5
// Read-only constants shared by every ChaCha20 entry point in this file.
// .Lsigma holds the 32-bit words 0x61707865,0x3320646e,0x79622d32,0x6b206574
// (ASCII "expand 32-byte k"), packed two per .quad.
11 .Lsigma:
12 .quad   0x3320646e61707865,0x6b20657479622d32           // endian-neutral
// .Lone sits directly after .Lsigma; the NEON code reaches it by post-incrementing
// the .Lsigma pointer by 16 and uses {1,0,0,0} to step lane 0 of the counter vector.
13 .Lone:
14 .long   1,0,0,0
// Distance from this word to OPENSSL_armcap_P; ChaCha20_ctr32 adds it to this
// word's own address to reach the capability flags.
15 .LOPENSSL_armcap_P:
16 #ifdef  __ILP32__
17 .long   OPENSSL_armcap_P-.
18 #else
19 .quad   OPENSSL_armcap_P-.
20 #endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
21 .byte   67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
22 .align  2
23
// ChaCha20_ctr32: XOR x2 bytes from [x1] with ChaCha20 keystream into [x0].
//   x0 = output pointer            x1 = input pointer
//   x2 = length in bytes (0 returns immediately)
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to 16 bytes of counter/nonce state
// NOTE(review): presumably matches OpenSSL's C prototype
//   ChaCha20_ctr32(out, inp, len, key, counter) -- confirm against the header.
// Lengths of 192 bytes or more are handed to ChaCha20_neon when the ARMV7_NEON
// capability bit is set; everything else runs the scalar code below, one
// 64-byte block per .Loop_outer pass.
// Scalar register use: x22-x28,x30 keep the packed input state (two 32-bit
// words per register); w5-w17,w19-w21 hold the sixteen working words.
// x30 (LR) is reused as a data register, so it is reloaded from the frame
// before returning.
24 .globl  ChaCha20_ctr32
25 .type   ChaCha20_ctr32,%function
26 .align  5
27 ChaCha20_ctr32:
28         cbz     x2,.Labort                      // len == 0: nothing to do
29         adr     x5,.LOPENSSL_armcap_P           // x5 = address of the offset word
30         cmp     x2,#192
31         b.lo    .Lshort                         // below 192 bytes: stay scalar
32 #ifdef  __ILP32__
33         ldrsw   x6,[x5]
34 #else
35         ldr     x6,[x5]
36 #endif
37         ldr     w17,[x6,x5]                     // w17 = OPENSSL_armcap_P (offset + base)
38         tst     w17,#ARMV7_NEON
39         b.ne    ChaCha20_neon                   // NEON present: take the vector path
40
41 .Lshort:
42 .inst   0xd503233f                      // paciasp
43         stp     x29,x30,[sp,#-96]!              // 96-byte frame: x29/x30 + x19-x28
44         add     x29,sp,#0
45
46         adr     x5,.Lsigma
47         stp     x19,x20,[sp,#16]
48         stp     x21,x22,[sp,#32]
49         stp     x23,x24,[sp,#48]
50         stp     x25,x26,[sp,#64]
51         stp     x27,x28,[sp,#80]
52         sub     sp,sp,#64                       // 64-byte scratch block for the tail path
53
54         ldp     x22,x23,[x5]            // load sigma
55         ldp     x24,x25,[x3]            // load key
56         ldp     x26,x27,[x3,#16]
57         ldp     x28,x30,[x4]            // load counter
58 #ifdef  __ARMEB__
59         ror     x24,x24,#32
60         ror     x25,x25,#32
61         ror     x26,x26,#32
62         ror     x27,x27,#32
63         ror     x28,x28,#32
64         ror     x30,x30,#32
65 #endif
66
67 .Loop_outer:
68         mov     w5,w22                  // unpack key block
69         lsr     x6,x22,#32
70         mov     w7,w23
71         lsr     x8,x23,#32
72         mov     w9,w24
73         lsr     x10,x24,#32
74         mov     w11,w25
75         lsr     x12,x25,#32
76         mov     w13,w26
77         lsr     x14,x26,#32
78         mov     w15,w27
79         lsr     x16,x27,#32
80         mov     w17,w28
81         lsr     x19,x28,#32
82         mov     w20,w30
83         lsr     x21,x30,#32
84
85         mov     x4,#10                          // 10 passes; each pass = column round + diagonal round
86         subs    x2,x2,#64                       // len -= 64; flags stay live until b.lo/b.hi below
87 .Loop:
88         sub     x4,x4,#1
89         add     w5,w5,w9                        // first half: words paired (5,9,13,17) (6,10,14,19) ...
90         add     w6,w6,w10
91         add     w7,w7,w11
92         add     w8,w8,w12
93         eor     w17,w17,w5
94         eor     w19,w19,w6
95         eor     w20,w20,w7
96         eor     w21,w21,w8
97         ror     w17,w17,#16
98         ror     w19,w19,#16
99         ror     w20,w20,#16
100         ror     w21,w21,#16
101         add     w13,w13,w17
102         add     w14,w14,w19
103         add     w15,w15,w20
104         add     w16,w16,w21
105         eor     w9,w9,w13
106         eor     w10,w10,w14
107         eor     w11,w11,w15
108         eor     w12,w12,w16
109         ror     w9,w9,#20
110         ror     w10,w10,#20
111         ror     w11,w11,#20
112         ror     w12,w12,#20
113         add     w5,w5,w9
114         add     w6,w6,w10
115         add     w7,w7,w11
116         add     w8,w8,w12
117         eor     w17,w17,w5
118         eor     w19,w19,w6
119         eor     w20,w20,w7
120         eor     w21,w21,w8
121         ror     w17,w17,#24
122         ror     w19,w19,#24
123         ror     w20,w20,#24
124         ror     w21,w21,#24
125         add     w13,w13,w17
126         add     w14,w14,w19
127         add     w15,w15,w20
128         add     w16,w16,w21
129         eor     w9,w9,w13
130         eor     w10,w10,w14
131         eor     w11,w11,w15
132         eor     w12,w12,w16
133         ror     w9,w9,#25
134         ror     w10,w10,#25
135         ror     w11,w11,#25
136         ror     w12,w12,#25
137         add     w5,w5,w10                       // second half: same steps with the pairing shifted (5,10,15,21) ...
138         add     w6,w6,w11
139         add     w7,w7,w12
140         add     w8,w8,w9
141         eor     w21,w21,w5
142         eor     w17,w17,w6
143         eor     w19,w19,w7
144         eor     w20,w20,w8
145         ror     w21,w21,#16
146         ror     w17,w17,#16
147         ror     w19,w19,#16
148         ror     w20,w20,#16
149         add     w15,w15,w21
150         add     w16,w16,w17
151         add     w13,w13,w19
152         add     w14,w14,w20
153         eor     w10,w10,w15
154         eor     w11,w11,w16
155         eor     w12,w12,w13
156         eor     w9,w9,w14
157         ror     w10,w10,#20
158         ror     w11,w11,#20
159         ror     w12,w12,#20
160         ror     w9,w9,#20
161         add     w5,w5,w10
162         add     w6,w6,w11
163         add     w7,w7,w12
164         add     w8,w8,w9
165         eor     w21,w21,w5
166         eor     w17,w17,w6
167         eor     w19,w19,w7
168         eor     w20,w20,w8
169         ror     w21,w21,#24
170         ror     w17,w17,#24
171         ror     w19,w19,#24
172         ror     w20,w20,#24
173         add     w15,w15,w21
174         add     w16,w16,w17
175         add     w13,w13,w19
176         add     w14,w14,w20
177         eor     w10,w10,w15
178         eor     w11,w11,w16
179         eor     w12,w12,w13
180         eor     w9,w9,w14
181         ror     w10,w10,#25
182         ror     w11,w11,#25
183         ror     w12,w12,#25
184         ror     w9,w9,#25
185         cbnz    x4,.Loop
186
187         add     w5,w5,w22               // accumulate key block
188         add     x6,x6,x22,lsr#32
189         add     w7,w7,w23
190         add     x8,x8,x23,lsr#32
191         add     w9,w9,w24
192         add     x10,x10,x24,lsr#32
193         add     w11,w11,w25
194         add     x12,x12,x25,lsr#32
195         add     w13,w13,w26
196         add     x14,x14,x26,lsr#32
197         add     w15,w15,w27
198         add     x16,x16,x27,lsr#32
199         add     w17,w17,w28
200         add     x19,x19,x28,lsr#32
201         add     w20,w20,w30
202         add     x21,x21,x30,lsr#32
203
204         b.lo    .Ltail                          // subs borrowed: fewer than 64 bytes were left
205
206         add     x5,x5,x6,lsl#32 // pack
207         add     x7,x7,x8,lsl#32
208         ldp     x6,x8,[x1,#0]           // load input
209         add     x9,x9,x10,lsl#32
210         add     x11,x11,x12,lsl#32
211         ldp     x10,x12,[x1,#16]
212         add     x13,x13,x14,lsl#32
213         add     x15,x15,x16,lsl#32
214         ldp     x14,x16,[x1,#32]
215         add     x17,x17,x19,lsl#32
216         add     x20,x20,x21,lsl#32
217         ldp     x19,x21,[x1,#48]
218         add     x1,x1,#64
219 #ifdef  __ARMEB__
220         rev     x5,x5
221         rev     x7,x7
222         rev     x9,x9
223         rev     x11,x11
224         rev     x13,x13
225         rev     x15,x15
226         rev     x17,x17
227         rev     x20,x20
228 #endif
229         eor     x5,x5,x6
230         eor     x7,x7,x8
231         eor     x9,x9,x10
232         eor     x11,x11,x12
233         eor     x13,x13,x14
234         eor     x15,x15,x16
235         eor     x17,x17,x19
236         eor     x20,x20,x21
237
238         stp     x5,x7,[x0,#0]           // store output
239         add     x28,x28,#1                      // increment counter
240         stp     x9,x11,[x0,#16]
241         stp     x13,x15,[x0,#32]
242         stp     x17,x20,[x0,#48]
243         add     x0,x0,#64
244
245         b.hi    .Loop_outer                     // flags still from subs: input remains
246
247         ldp     x19,x20,[x29,#16]
248         add     sp,sp,#64                       // drop the scratch block
249         ldp     x21,x22,[x29,#32]
250         ldp     x23,x24,[x29,#48]
251         ldp     x25,x26,[x29,#64]
252         ldp     x27,x28,[x29,#80]
253         ldp     x29,x30,[sp],#96
254 .inst   0xd50323bf                      // autiasp
255 .Labort:
256         ret
257
258 .align  4
259 .Ltail:
260         add     x2,x2,#64                       // undo the subs: x2 = bytes remaining (< 64)
// .Less_than_64 is also branched to from ChaCha20_neon's tail handling.
261 .Less_than_64:
262         sub     x0,x0,#1                        // bias by -1: strb below indexes with x2 after its increment
263         add     x1,x1,x2                        // x1 = end of input
264         add     x0,x0,x2                        // x0 = end of output - 1
265         add     x4,sp,x2                        // x4 = end of the keystream bytes on the stack
266         neg     x2,x2                           // x2 = -remaining, counts up to zero
267
268         add     x5,x5,x6,lsl#32 // pack
269         add     x7,x7,x8,lsl#32
270         add     x9,x9,x10,lsl#32
271         add     x11,x11,x12,lsl#32
272         add     x13,x13,x14,lsl#32
273         add     x15,x15,x16,lsl#32
274         add     x17,x17,x19,lsl#32
275         add     x20,x20,x21,lsl#32
276 #ifdef  __ARMEB__
277         rev     x5,x5
278         rev     x7,x7
279         rev     x9,x9
280         rev     x11,x11
281         rev     x13,x13
282         rev     x15,x15
283         rev     x17,x17
284         rev     x20,x20
285 #endif
286         stp     x5,x7,[sp,#0]                   // spill the 64-byte keystream block to the scratch area
287         stp     x9,x11,[sp,#16]
288         stp     x13,x15,[sp,#32]
289         stp     x17,x20,[sp,#48]
290
291 .Loop_tail:
292         ldrb    w10,[x1,x2]                     // input byte
293         ldrb    w11,[x4,x2]                     // keystream byte
294         add     x2,x2,#1
295         eor     w10,w10,w11
296         strb    w10,[x0,x2]
297         cbnz    x2,.Loop_tail
298
299         stp     xzr,xzr,[sp,#0]                 // wipe the keystream copy from the stack
300         stp     xzr,xzr,[sp,#16]
301         stp     xzr,xzr,[sp,#32]
302         stp     xzr,xzr,[sp,#48]
303
304         ldp     x19,x20,[x29,#16]
305         add     sp,sp,#64
306         ldp     x21,x22,[x29,#32]
307         ldp     x23,x24,[x29,#48]
308         ldp     x25,x26,[x29,#64]
309         ldp     x27,x28,[x29,#80]
310         ldp     x29,x30,[sp],#96
311 .inst   0xd50323bf                      // autiasp
312         ret
313 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
314
// ChaCha20_neon: vector path reached by a branch from ChaCha20_ctr32 with the
// same register arguments (x0 = out, x1 = in, x2 = length, x3 = key,
// x4 = counter). It is not declared .globl in this file.
// Lengths of 512 bytes or more branch on to .L512_or_more_neon (inside
// ChaCha20_512_neon) after the common register saves.
// Each .Loop_outer_neon pass produces 256 bytes of keystream:
//   - one 64-byte block in scalar registers (w5-w17,w19-w21), counter +0
//   - three 64-byte blocks in NEON registers, one state row per vector:
//       v0-v3  (counter +1), v4-v7 (counter +2), v16-v19 (counter +3)
// Scalar and vector instructions are interleaved line by line.
// v24-v26 keep sigma and the key, v27-v29 the three counter rows, v31 the
// counter step; v20-v23 are temporaries. Only v0-v7 and v16-v31 are written
// here, so d8-d15 are not saved in this function.
315 .type   ChaCha20_neon,%function
316 .align  5
317 ChaCha20_neon:
318 .inst   0xd503233f                      // paciasp
319         stp     x29,x30,[sp,#-96]!              // same 96-byte frame layout as ChaCha20_ctr32
320         add     x29,sp,#0
321
322         adr     x5,.Lsigma
323         stp     x19,x20,[sp,#16]
324         stp     x21,x22,[sp,#32]
325         stp     x23,x24,[sp,#48]
326         stp     x25,x26,[sp,#64]
327         stp     x27,x28,[sp,#80]
328         cmp     x2,#512
329         b.hs    .L512_or_more_neon              // 512 bytes or more: continue in ChaCha20_512_neon
330
331         sub     sp,sp,#64                       // 64-byte scratch block for the tail path
332
333         ldp     x22,x23,[x5]            // load sigma
334         ld1     {v24.4s},[x5],#16               // post-increment leaves x5 at .Lone
335         ldp     x24,x25,[x3]            // load key
336         ldp     x26,x27,[x3,#16]
337         ld1     {v25.4s,v26.4s},[x3]
338         ldp     x28,x30,[x4]            // load counter
339         ld1     {v27.4s},[x4]
340         ld1     {v31.4s},[x5]                   // v31 = {1,0,0,0}
341 #ifdef  __ARMEB__
342         rev64   v24.4s,v24.4s
343         ror     x24,x24,#32
344         ror     x25,x25,#32
345         ror     x26,x26,#32
346         ror     x27,x27,#32
347         ror     x28,x28,#32
348         ror     x30,x30,#32
349 #endif
350         add     v27.4s,v27.4s,v31.4s            // += 1
351         add     v28.4s,v27.4s,v31.4s            // v28 = counter + 2
352         add     v29.4s,v28.4s,v31.4s            // v29 = counter + 3
353         shl     v31.4s,v31.4s,#2                        // 1 -> 4
354
355 .Loop_outer_neon:
356         mov     w5,w22                  // unpack key block
357         lsr     x6,x22,#32
358         mov     v0.16b,v24.16b
359         mov     w7,w23
360         lsr     x8,x23,#32
361         mov     v4.16b,v24.16b
362         mov     w9,w24
363         lsr     x10,x24,#32
364         mov     v16.16b,v24.16b
365         mov     w11,w25
366         mov     v1.16b,v25.16b
367         lsr     x12,x25,#32
368         mov     v5.16b,v25.16b
369         mov     w13,w26
370         mov     v17.16b,v25.16b
371         lsr     x14,x26,#32
372         mov     v3.16b,v27.16b
373         mov     w15,w27
374         mov     v7.16b,v28.16b
375         lsr     x16,x27,#32
376         mov     v19.16b,v29.16b
377         mov     w17,w28
378         mov     v2.16b,v26.16b
379         lsr     x19,x28,#32
380         mov     v6.16b,v26.16b
381         mov     w20,w30
382         mov     v18.16b,v26.16b
383         lsr     x21,x30,#32
384
385         mov     x4,#10                          // 10 passes; each pass = column round + diagonal round
386         subs    x2,x2,#256                      // len -= 256; flags stay live until b.lo/b.hi below
387 .Loop_neon:
388         sub     x4,x4,#1
389         add     v0.4s,v0.4s,v1.4s
390         add     w5,w5,w9
391         add     v4.4s,v4.4s,v5.4s
392         add     w6,w6,w10
393         add     v16.4s,v16.4s,v17.4s
394         add     w7,w7,w11
395         eor     v3.16b,v3.16b,v0.16b
396         add     w8,w8,w12
397         eor     v7.16b,v7.16b,v4.16b
398         eor     w17,w17,w5
399         eor     v19.16b,v19.16b,v16.16b
400         eor     w19,w19,w6
401         rev32   v3.8h,v3.8h                     // swap halfwords in each word = rotate by 16
402         eor     w20,w20,w7
403         rev32   v7.8h,v7.8h
404         eor     w21,w21,w8
405         rev32   v19.8h,v19.8h
406         ror     w17,w17,#16
407         add     v2.4s,v2.4s,v3.4s
408         ror     w19,w19,#16
409         add     v6.4s,v6.4s,v7.4s
410         ror     w20,w20,#16
411         add     v18.4s,v18.4s,v19.4s
412         ror     w21,w21,#16
413         eor     v20.16b,v1.16b,v2.16b
414         add     w13,w13,w17
415         eor     v21.16b,v5.16b,v6.16b
416         add     w14,w14,w19
417         eor     v22.16b,v17.16b,v18.16b
418         add     w15,w15,w20
419         ushr    v1.4s,v20.4s,#20                // ushr #20 + sli #12 together rotate each word left by 12
420         add     w16,w16,w21
421         ushr    v5.4s,v21.4s,#20
422         eor     w9,w9,w13
423         ushr    v17.4s,v22.4s,#20
424         eor     w10,w10,w14
425         sli     v1.4s,v20.4s,#12
426         eor     w11,w11,w15
427         sli     v5.4s,v21.4s,#12
428         eor     w12,w12,w16
429         sli     v17.4s,v22.4s,#12
430         ror     w9,w9,#20
431         add     v0.4s,v0.4s,v1.4s
432         ror     w10,w10,#20
433         add     v4.4s,v4.4s,v5.4s
434         ror     w11,w11,#20
435         add     v16.4s,v16.4s,v17.4s
436         ror     w12,w12,#20
437         eor     v20.16b,v3.16b,v0.16b
438         add     w5,w5,w9
439         eor     v21.16b,v7.16b,v4.16b
440         add     w6,w6,w10
441         eor     v22.16b,v19.16b,v16.16b
442         add     w7,w7,w11
443         ushr    v3.4s,v20.4s,#24                // ushr #24 + sli #8: rotate left by 8
444         add     w8,w8,w12
445         ushr    v7.4s,v21.4s,#24
446         eor     w17,w17,w5
447         ushr    v19.4s,v22.4s,#24
448         eor     w19,w19,w6
449         sli     v3.4s,v20.4s,#8
450         eor     w20,w20,w7
451         sli     v7.4s,v21.4s,#8
452         eor     w21,w21,w8
453         sli     v19.4s,v22.4s,#8
454         ror     w17,w17,#24
455         add     v2.4s,v2.4s,v3.4s
456         ror     w19,w19,#24
457         add     v6.4s,v6.4s,v7.4s
458         ror     w20,w20,#24
459         add     v18.4s,v18.4s,v19.4s
460         ror     w21,w21,#24
461         eor     v20.16b,v1.16b,v2.16b
462         add     w13,w13,w17
463         eor     v21.16b,v5.16b,v6.16b
464         add     w14,w14,w19
465         eor     v22.16b,v17.16b,v18.16b
466         add     w15,w15,w20
467         ushr    v1.4s,v20.4s,#25                // ushr #25 + sli #7: rotate left by 7
468         add     w16,w16,w21
469         ushr    v5.4s,v21.4s,#25
470         eor     w9,w9,w13
471         ushr    v17.4s,v22.4s,#25
472         eor     w10,w10,w14
473         sli     v1.4s,v20.4s,#7
474         eor     w11,w11,w15
475         sli     v5.4s,v21.4s,#7
476         eor     w12,w12,w16
477         sli     v17.4s,v22.4s,#7
478         ror     w9,w9,#25
479         ext     v2.16b,v2.16b,v2.16b,#8         // rotate rows by 8/12/4 bytes for the second half-round
480         ror     w10,w10,#25
481         ext     v6.16b,v6.16b,v6.16b,#8
482         ror     w11,w11,#25
483         ext     v18.16b,v18.16b,v18.16b,#8
484         ror     w12,w12,#25
485         ext     v3.16b,v3.16b,v3.16b,#12
486         ext     v7.16b,v7.16b,v7.16b,#12
487         ext     v19.16b,v19.16b,v19.16b,#12
488         ext     v1.16b,v1.16b,v1.16b,#4
489         ext     v5.16b,v5.16b,v5.16b,#4
490         ext     v17.16b,v17.16b,v17.16b,#4
491         add     v0.4s,v0.4s,v1.4s
492         add     w5,w5,w10
493         add     v4.4s,v4.4s,v5.4s
494         add     w6,w6,w11
495         add     v16.4s,v16.4s,v17.4s
496         add     w7,w7,w12
497         eor     v3.16b,v3.16b,v0.16b
498         add     w8,w8,w9
499         eor     v7.16b,v7.16b,v4.16b
500         eor     w21,w21,w5
501         eor     v19.16b,v19.16b,v16.16b
502         eor     w17,w17,w6
503         rev32   v3.8h,v3.8h
504         eor     w19,w19,w7
505         rev32   v7.8h,v7.8h
506         eor     w20,w20,w8
507         rev32   v19.8h,v19.8h
508         ror     w21,w21,#16
509         add     v2.4s,v2.4s,v3.4s
510         ror     w17,w17,#16
511         add     v6.4s,v6.4s,v7.4s
512         ror     w19,w19,#16
513         add     v18.4s,v18.4s,v19.4s
514         ror     w20,w20,#16
515         eor     v20.16b,v1.16b,v2.16b
516         add     w15,w15,w21
517         eor     v21.16b,v5.16b,v6.16b
518         add     w16,w16,w17
519         eor     v22.16b,v17.16b,v18.16b
520         add     w13,w13,w19
521         ushr    v1.4s,v20.4s,#20
522         add     w14,w14,w20
523         ushr    v5.4s,v21.4s,#20
524         eor     w10,w10,w15
525         ushr    v17.4s,v22.4s,#20
526         eor     w11,w11,w16
527         sli     v1.4s,v20.4s,#12
528         eor     w12,w12,w13
529         sli     v5.4s,v21.4s,#12
530         eor     w9,w9,w14
531         sli     v17.4s,v22.4s,#12
532         ror     w10,w10,#20
533         add     v0.4s,v0.4s,v1.4s
534         ror     w11,w11,#20
535         add     v4.4s,v4.4s,v5.4s
536         ror     w12,w12,#20
537         add     v16.4s,v16.4s,v17.4s
538         ror     w9,w9,#20
539         eor     v20.16b,v3.16b,v0.16b
540         add     w5,w5,w10
541         eor     v21.16b,v7.16b,v4.16b
542         add     w6,w6,w11
543         eor     v22.16b,v19.16b,v16.16b
544         add     w7,w7,w12
545         ushr    v3.4s,v20.4s,#24
546         add     w8,w8,w9
547         ushr    v7.4s,v21.4s,#24
548         eor     w21,w21,w5
549         ushr    v19.4s,v22.4s,#24
550         eor     w17,w17,w6
551         sli     v3.4s,v20.4s,#8
552         eor     w19,w19,w7
553         sli     v7.4s,v21.4s,#8
554         eor     w20,w20,w8
555         sli     v19.4s,v22.4s,#8
556         ror     w21,w21,#24
557         add     v2.4s,v2.4s,v3.4s
558         ror     w17,w17,#24
559         add     v6.4s,v6.4s,v7.4s
560         ror     w19,w19,#24
561         add     v18.4s,v18.4s,v19.4s
562         ror     w20,w20,#24
563         eor     v20.16b,v1.16b,v2.16b
564         add     w15,w15,w21
565         eor     v21.16b,v5.16b,v6.16b
566         add     w16,w16,w17
567         eor     v22.16b,v17.16b,v18.16b
568         add     w13,w13,w19
569         ushr    v1.4s,v20.4s,#25
570         add     w14,w14,w20
571         ushr    v5.4s,v21.4s,#25
572         eor     w10,w10,w15
573         ushr    v17.4s,v22.4s,#25
574         eor     w11,w11,w16
575         sli     v1.4s,v20.4s,#7
576         eor     w12,w12,w13
577         sli     v5.4s,v21.4s,#7
578         eor     w9,w9,w14
579         sli     v17.4s,v22.4s,#7
580         ror     w10,w10,#25
581         ext     v2.16b,v2.16b,v2.16b,#8         // inverse row rotation (8/4/12 bytes) restores the layout
582         ror     w11,w11,#25
583         ext     v6.16b,v6.16b,v6.16b,#8
584         ror     w12,w12,#25
585         ext     v18.16b,v18.16b,v18.16b,#8
586         ror     w9,w9,#25
587         ext     v3.16b,v3.16b,v3.16b,#4
588         ext     v7.16b,v7.16b,v7.16b,#4
589         ext     v19.16b,v19.16b,v19.16b,#4
590         ext     v1.16b,v1.16b,v1.16b,#12
591         ext     v5.16b,v5.16b,v5.16b,#12
592         ext     v17.16b,v17.16b,v17.16b,#12
593         cbnz    x4,.Loop_neon
594
595         add     w5,w5,w22               // accumulate key block
596         add     v0.4s,v0.4s,v24.4s
597         add     x6,x6,x22,lsr#32
598         add     v4.4s,v4.4s,v24.4s
599         add     w7,w7,w23
600         add     v16.4s,v16.4s,v24.4s
601         add     x8,x8,x23,lsr#32
602         add     v2.4s,v2.4s,v26.4s
603         add     w9,w9,w24
604         add     v6.4s,v6.4s,v26.4s
605         add     x10,x10,x24,lsr#32
606         add     v18.4s,v18.4s,v26.4s
607         add     w11,w11,w25
608         add     v3.4s,v3.4s,v27.4s
609         add     x12,x12,x25,lsr#32
610         add     w13,w13,w26
611         add     v7.4s,v7.4s,v28.4s
612         add     x14,x14,x26,lsr#32
613         add     w15,w15,w27
614         add     v19.4s,v19.4s,v29.4s
615         add     x16,x16,x27,lsr#32
616         add     w17,w17,w28
617         add     v1.4s,v1.4s,v25.4s
618         add     x19,x19,x28,lsr#32
619         add     w20,w20,w30
620         add     v5.4s,v5.4s,v25.4s
621         add     x21,x21,x30,lsr#32
622         add     v17.4s,v17.4s,v25.4s
623
624         b.lo    .Ltail_neon                     // subs borrowed: fewer than 256 bytes were left
625
626         add     x5,x5,x6,lsl#32 // pack
627         add     x7,x7,x8,lsl#32
628         ldp     x6,x8,[x1,#0]           // load input
629         add     x9,x9,x10,lsl#32
630         add     x11,x11,x12,lsl#32
631         ldp     x10,x12,[x1,#16]
632         add     x13,x13,x14,lsl#32
633         add     x15,x15,x16,lsl#32
634         ldp     x14,x16,[x1,#32]
635         add     x17,x17,x19,lsl#32
636         add     x20,x20,x21,lsl#32
637         ldp     x19,x21,[x1,#48]
638         add     x1,x1,#64
639 #ifdef  __ARMEB__
640         rev     x5,x5
641         rev     x7,x7
642         rev     x9,x9
643         rev     x11,x11
644         rev     x13,x13
645         rev     x15,x15
646         rev     x17,x17
647         rev     x20,x20
648 #endif
649         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64      // input for the first NEON block
650         eor     x5,x5,x6
651         eor     x7,x7,x8
652         eor     x9,x9,x10
653         eor     x11,x11,x12
654         eor     x13,x13,x14
655         eor     v0.16b,v0.16b,v20.16b
656         eor     x15,x15,x16
657         eor     v1.16b,v1.16b,v21.16b
658         eor     x17,x17,x19
659         eor     v2.16b,v2.16b,v22.16b
660         eor     x20,x20,x21
661         eor     v3.16b,v3.16b,v23.16b
662         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64      // input for the second NEON block
663
664         stp     x5,x7,[x0,#0]           // store output
665         add     x28,x28,#4                      // increment counter
666         stp     x9,x11,[x0,#16]
667         add     v27.4s,v27.4s,v31.4s            // += 4
668         stp     x13,x15,[x0,#32]
669         add     v28.4s,v28.4s,v31.4s
670         stp     x17,x20,[x0,#48]
671         add     v29.4s,v29.4s,v31.4s
672         add     x0,x0,#64
673
674         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
675         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64          // v0-v3 reused for the third block's input
676
677         eor     v4.16b,v4.16b,v20.16b
678         eor     v5.16b,v5.16b,v21.16b
679         eor     v6.16b,v6.16b,v22.16b
680         eor     v7.16b,v7.16b,v23.16b
681         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
682
683         eor     v16.16b,v16.16b,v0.16b
684         eor     v17.16b,v17.16b,v1.16b
685         eor     v18.16b,v18.16b,v2.16b
686         eor     v19.16b,v19.16b,v3.16b
687         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
688
689         b.hi    .Loop_outer_neon                // flags still from subs: input remains
690
691         ldp     x19,x20,[x29,#16]
692         add     sp,sp,#64
693         ldp     x21,x22,[x29,#32]
694         ldp     x23,x24,[x29,#48]
695         ldp     x25,x26,[x29,#64]
696         ldp     x27,x28,[x29,#80]
697         ldp     x29,x30,[sp],#96
698 .inst   0xd50323bf                      // autiasp
699         ret
700
// Fewer than 256 bytes remain. Whole 64-byte blocks are consumed in the order
// scalar, v0-v3, v4-v7; the first keystream block that is only partly needed
// is written to the stack scratch area and finished byte-wise.
701 .Ltail_neon:
702         add     x2,x2,#256                      // undo the subs: x2 = bytes remaining (< 256)
703         cmp     x2,#64
704         b.lo    .Less_than_64                   // partial scalar block: shared tail in ChaCha20_ctr32
705
706         add     x5,x5,x6,lsl#32 // pack
707         add     x7,x7,x8,lsl#32
708         ldp     x6,x8,[x1,#0]           // load input
709         add     x9,x9,x10,lsl#32
710         add     x11,x11,x12,lsl#32
711         ldp     x10,x12,[x1,#16]
712         add     x13,x13,x14,lsl#32
713         add     x15,x15,x16,lsl#32
714         ldp     x14,x16,[x1,#32]
715         add     x17,x17,x19,lsl#32
716         add     x20,x20,x21,lsl#32
717         ldp     x19,x21,[x1,#48]
718         add     x1,x1,#64
719 #ifdef  __ARMEB__
720         rev     x5,x5
721         rev     x7,x7
722         rev     x9,x9
723         rev     x11,x11
724         rev     x13,x13
725         rev     x15,x15
726         rev     x17,x17
727         rev     x20,x20
728 #endif
729         eor     x5,x5,x6
730         eor     x7,x7,x8
731         eor     x9,x9,x10
732         eor     x11,x11,x12
733         eor     x13,x13,x14
734         eor     x15,x15,x16
735         eor     x17,x17,x19
736         eor     x20,x20,x21
737
738         stp     x5,x7,[x0,#0]           // store output
739         add     x28,x28,#4                      // increment counter
740         stp     x9,x11,[x0,#16]
741         stp     x13,x15,[x0,#32]
742         stp     x17,x20,[x0,#48]
743         add     x0,x0,#64
744         b.eq    .Ldone_neon                     // flags from the cmp above: exactly 64 bytes were left
745         sub     x2,x2,#64
746         cmp     x2,#64
747         b.lo    .Less_than_128
748
749         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
750         eor     v0.16b,v0.16b,v20.16b
751         eor     v1.16b,v1.16b,v21.16b
752         eor     v2.16b,v2.16b,v22.16b
753         eor     v3.16b,v3.16b,v23.16b
754         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
755         b.eq    .Ldone_neon
756         sub     x2,x2,#64
757         cmp     x2,#64
758         b.lo    .Less_than_192
759
760         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
761         eor     v4.16b,v4.16b,v20.16b
762         eor     v5.16b,v5.16b,v21.16b
763         eor     v6.16b,v6.16b,v22.16b
764         eor     v7.16b,v7.16b,v23.16b
765         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
766         b.eq    .Ldone_neon
767         sub     x2,x2,#64
768
769         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[sp]          // last block is partial: spill its keystream
770         b       .Last_neon
771
772 .Less_than_128:
773         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
774         b       .Last_neon
775 .Less_than_192:
776         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
777         b       .Last_neon
778
779 .align  4
780 .Last_neon:
781         sub     x0,x0,#1                        // bias by -1: strb below indexes with x2 after its increment
782         add     x1,x1,x2                        // x1 = end of input
783         add     x0,x0,x2                        // x0 = end of output - 1
784         add     x4,sp,x2                        // x4 = end of the keystream bytes on the stack
785         neg     x2,x2                           // x2 = -remaining, counts up to zero
786
787 .Loop_tail_neon:
788         ldrb    w10,[x1,x2]                     // input byte
789         ldrb    w11,[x4,x2]                     // keystream byte
790         add     x2,x2,#1
791         eor     w10,w10,w11
792         strb    w10,[x0,x2]
793         cbnz    x2,.Loop_tail_neon
794
795         stp     xzr,xzr,[sp,#0]                 // wipe the keystream copy from the stack
796         stp     xzr,xzr,[sp,#16]
797         stp     xzr,xzr,[sp,#32]
798         stp     xzr,xzr,[sp,#48]
799
800 .Ldone_neon:
801         ldp     x19,x20,[x29,#16]
802         add     sp,sp,#64
803         ldp     x21,x22,[x29,#32]
804         ldp     x23,x24,[x29,#48]
805         ldp     x25,x26,[x29,#64]
806         ldp     x27,x28,[x29,#80]
807         ldp     x29,x30,[sp],#96
808 .inst   0xd50323bf                      // autiasp
809         ret
810 .size   ChaCha20_neon,.-ChaCha20_neon
811 .type   ChaCha20_512_neon,%function
812 .align  5
813 ChaCha20_512_neon:
814 .inst   0xd503233f                      // paciasp
815         stp     x29,x30,[sp,#-96]!
816         add     x29,sp,#0
817
818         adr     x5,.Lsigma
819         stp     x19,x20,[sp,#16]
820         stp     x21,x22,[sp,#32]
821         stp     x23,x24,[sp,#48]
822         stp     x25,x26,[sp,#64]
823         stp     x27,x28,[sp,#80]
824
825 .L512_or_more_neon:
826         sub     sp,sp,#128+64
827
828         ldp     x22,x23,[x5]            // load sigma
829         ld1     {v24.4s},[x5],#16
830         ldp     x24,x25,[x3]            // load key
831         ldp     x26,x27,[x3,#16]
832         ld1     {v25.4s,v26.4s},[x3]
833         ldp     x28,x30,[x4]            // load counter
834         ld1     {v27.4s},[x4]
835         ld1     {v31.4s},[x5]
836 #ifdef  __ARMEB__
837         rev64   v24.4s,v24.4s
838         ror     x24,x24,#32
839         ror     x25,x25,#32
840         ror     x26,x26,#32
841         ror     x27,x27,#32
842         ror     x28,x28,#32
843         ror     x30,x30,#32
844 #endif
845         add     v27.4s,v27.4s,v31.4s            // += 1
846         stp     q24,q25,[sp,#0]         // off-load key block, invariant part
847         add     v27.4s,v27.4s,v31.4s            // not typo
848         str     q26,[sp,#32]
849         add     v28.4s,v27.4s,v31.4s
850         add     v29.4s,v28.4s,v31.4s
851         add     v30.4s,v29.4s,v31.4s
852         shl     v31.4s,v31.4s,#2                        // 1 -> 4
853
854         stp     d8,d9,[sp,#128+0]               // meet ABI requirements
855         stp     d10,d11,[sp,#128+16]
856         stp     d12,d13,[sp,#128+32]
857         stp     d14,d15,[sp,#128+48]
858
859         sub     x2,x2,#512                      // not typo
860
861 .Loop_outer_512_neon:
862         mov     v0.16b,v24.16b
863         mov     v4.16b,v24.16b
864         mov     v8.16b,v24.16b
865         mov     v12.16b,v24.16b
866         mov     v16.16b,v24.16b
867         mov     v20.16b,v24.16b
868         mov     v1.16b,v25.16b
869         mov     w5,w22                  // unpack key block
870         mov     v5.16b,v25.16b
871         lsr     x6,x22,#32
872         mov     v9.16b,v25.16b
873         mov     w7,w23
874         mov     v13.16b,v25.16b
875         lsr     x8,x23,#32
876         mov     v17.16b,v25.16b
877         mov     w9,w24
878         mov     v21.16b,v25.16b
879         lsr     x10,x24,#32
880         mov     v3.16b,v27.16b
881         mov     w11,w25
882         mov     v7.16b,v28.16b
883         lsr     x12,x25,#32
884         mov     v11.16b,v29.16b
885         mov     w13,w26
886         mov     v15.16b,v30.16b
887         lsr     x14,x26,#32
888         mov     v2.16b,v26.16b
889         mov     w15,w27
890         mov     v6.16b,v26.16b
891         lsr     x16,x27,#32
892         add     v19.4s,v3.4s,v31.4s                     // +4
893         mov     w17,w28
894         add     v23.4s,v7.4s,v31.4s                     // +4
895         lsr     x19,x28,#32
896         mov     v10.16b,v26.16b
897         mov     w20,w30
898         mov     v14.16b,v26.16b
899         lsr     x21,x30,#32
900         mov     v18.16b,v26.16b
901         stp     q27,q28,[sp,#48]                // off-load key block, variable part
902         mov     v22.16b,v26.16b
903         str     q29,[sp,#80]
904
// Upper half of the round loop: 5 passes, counted down in x4.
// Every pass interleaves two independent instruction streams:
//   scalar: one 16-word state in w5-w8 (a), w9-w12 (b), w13-w16 (c) and
//           w17,w19,w20,w21 (d).  A pass runs column, diagonal, column,
//           diagonal rounds, i.e. 2 double rounds (10 over the 5 passes).
//   NEON:   six (a,b,c,d) register quadruples v0-v3, v4-v7, ... v20-v23,
//           with v24-v29 as temporaries.  A pass runs one column round
//           and one diagonal round.
// Rotations: scalar "ror #16/#20/#24/#25" equal left-rotates by
// 16/12/8/7; NEON uses rev32 on .8h for 16 and ushr+sli pairs for 12/8/7.
// No instruction inside the loop writes the condition flags, so the result
// of the subs below (x2 is presumably the remaining length - confirm
// against the code outside this excerpt) is still available afterwards.
905         mov     x4,#5
906         subs    x2,x2,#512
907 .Loop_upper_neon:
908         sub     x4,x4,#1
909         add     v0.4s,v0.4s,v1.4s
910         add     w5,w5,w9
911         add     v4.4s,v4.4s,v5.4s
912         add     w6,w6,w10
913         add     v8.4s,v8.4s,v9.4s
914         add     w7,w7,w11
915         add     v12.4s,v12.4s,v13.4s
916         add     w8,w8,w12
917         add     v16.4s,v16.4s,v17.4s
918         eor     w17,w17,w5
919         add     v20.4s,v20.4s,v21.4s
920         eor     w19,w19,w6
921         eor     v3.16b,v3.16b,v0.16b
922         eor     w20,w20,w7
923         eor     v7.16b,v7.16b,v4.16b
924         eor     w21,w21,w8
925         eor     v11.16b,v11.16b,v8.16b
926         ror     w17,w17,#16
927         eor     v15.16b,v15.16b,v12.16b
928         ror     w19,w19,#16
929         eor     v19.16b,v19.16b,v16.16b
930         ror     w20,w20,#16
931         eor     v23.16b,v23.16b,v20.16b
932         ror     w21,w21,#16
933         rev32   v3.8h,v3.8h
934         add     w13,w13,w17
935         rev32   v7.8h,v7.8h
936         add     w14,w14,w19
937         rev32   v11.8h,v11.8h
938         add     w15,w15,w20
939         rev32   v15.8h,v15.8h
940         add     w16,w16,w21
941         rev32   v19.8h,v19.8h
942         eor     w9,w9,w13
943         rev32   v23.8h,v23.8h
944         eor     w10,w10,w14
945         add     v2.4s,v2.4s,v3.4s
946         eor     w11,w11,w15
947         add     v6.4s,v6.4s,v7.4s
948         eor     w12,w12,w16
949         add     v10.4s,v10.4s,v11.4s
950         ror     w9,w9,#20
951         add     v14.4s,v14.4s,v15.4s
952         ror     w10,w10,#20
953         add     v18.4s,v18.4s,v19.4s
954         ror     w11,w11,#20
955         add     v22.4s,v22.4s,v23.4s
956         ror     w12,w12,#20
957         eor     v24.16b,v1.16b,v2.16b
958         add     w5,w5,w9
959         eor     v25.16b,v5.16b,v6.16b
960         add     w6,w6,w10
961         eor     v26.16b,v9.16b,v10.16b
962         add     w7,w7,w11
963         eor     v27.16b,v13.16b,v14.16b
964         add     w8,w8,w12
965         eor     v28.16b,v17.16b,v18.16b
966         eor     w17,w17,w5
967         eor     v29.16b,v21.16b,v22.16b
968         eor     w19,w19,w6
969         ushr    v1.4s,v24.4s,#20
970         eor     w20,w20,w7
971         ushr    v5.4s,v25.4s,#20
972         eor     w21,w21,w8
973         ushr    v9.4s,v26.4s,#20
974         ror     w17,w17,#24
975         ushr    v13.4s,v27.4s,#20
976         ror     w19,w19,#24
977         ushr    v17.4s,v28.4s,#20
978         ror     w20,w20,#24
979         ushr    v21.4s,v29.4s,#20
980         ror     w21,w21,#24
981         sli     v1.4s,v24.4s,#12
982         add     w13,w13,w17
983         sli     v5.4s,v25.4s,#12
984         add     w14,w14,w19
985         sli     v9.4s,v26.4s,#12
986         add     w15,w15,w20
987         sli     v13.4s,v27.4s,#12
988         add     w16,w16,w21
989         sli     v17.4s,v28.4s,#12
990         eor     w9,w9,w13
991         sli     v21.4s,v29.4s,#12
992         eor     w10,w10,w14
993         add     v0.4s,v0.4s,v1.4s
994         eor     w11,w11,w15
995         add     v4.4s,v4.4s,v5.4s
996         eor     w12,w12,w16
997         add     v8.4s,v8.4s,v9.4s
998         ror     w9,w9,#25
999         add     v12.4s,v12.4s,v13.4s
1000         ror     w10,w10,#25
1001         add     v16.4s,v16.4s,v17.4s
1002         ror     w11,w11,#25
1003         add     v20.4s,v20.4s,v21.4s
1004         ror     w12,w12,#25
1005         eor     v24.16b,v3.16b,v0.16b
            // scalar: first diagonal round starts with the next scalar add
            // (groups w5/w10/w15/w21, w6/w11/w16/w17, w7/w12/w13/w19, w8/w9/w14/w20)
1006         add     w5,w5,w10
1007         eor     v25.16b,v7.16b,v4.16b
1008         add     w6,w6,w11
1009         eor     v26.16b,v11.16b,v8.16b
1010         add     w7,w7,w12
1011         eor     v27.16b,v15.16b,v12.16b
1012         add     w8,w8,w9
1013         eor     v28.16b,v19.16b,v16.16b
1014         eor     w21,w21,w5
1015         eor     v29.16b,v23.16b,v20.16b
1016         eor     w17,w17,w6
1017         ushr    v3.4s,v24.4s,#24
1018         eor     w19,w19,w7
1019         ushr    v7.4s,v25.4s,#24
1020         eor     w20,w20,w8
1021         ushr    v11.4s,v26.4s,#24
1022         ror     w21,w21,#16
1023         ushr    v15.4s,v27.4s,#24
1024         ror     w17,w17,#16
1025         ushr    v19.4s,v28.4s,#24
1026         ror     w19,w19,#16
1027         ushr    v23.4s,v29.4s,#24
1028         ror     w20,w20,#16
1029         sli     v3.4s,v24.4s,#8
1030         add     w15,w15,w21
1031         sli     v7.4s,v25.4s,#8
1032         add     w16,w16,w17
1033         sli     v11.4s,v26.4s,#8
1034         add     w13,w13,w19
1035         sli     v15.4s,v27.4s,#8
1036         add     w14,w14,w20
1037         sli     v19.4s,v28.4s,#8
1038         eor     w10,w10,w15
1039         sli     v23.4s,v29.4s,#8
1040         eor     w11,w11,w16
1041         add     v2.4s,v2.4s,v3.4s
1042         eor     w12,w12,w13
1043         add     v6.4s,v6.4s,v7.4s
1044         eor     w9,w9,w14
1045         add     v10.4s,v10.4s,v11.4s
1046         ror     w10,w10,#20
1047         add     v14.4s,v14.4s,v15.4s
1048         ror     w11,w11,#20
1049         add     v18.4s,v18.4s,v19.4s
1050         ror     w12,w12,#20
1051         add     v22.4s,v22.4s,v23.4s
1052         ror     w9,w9,#20
1053         eor     v24.16b,v1.16b,v2.16b
1054         add     w5,w5,w10
1055         eor     v25.16b,v5.16b,v6.16b
1056         add     w6,w6,w11
1057         eor     v26.16b,v9.16b,v10.16b
1058         add     w7,w7,w12
1059         eor     v27.16b,v13.16b,v14.16b
1060         add     w8,w8,w9
1061         eor     v28.16b,v17.16b,v18.16b
1062         eor     w21,w21,w5
1063         eor     v29.16b,v21.16b,v22.16b
1064         eor     w17,w17,w6
1065         ushr    v1.4s,v24.4s,#25
1066         eor     w19,w19,w7
1067         ushr    v5.4s,v25.4s,#25
1068         eor     w20,w20,w8
1069         ushr    v9.4s,v26.4s,#25
1070         ror     w21,w21,#24
1071         ushr    v13.4s,v27.4s,#25
1072         ror     w17,w17,#24
1073         ushr    v17.4s,v28.4s,#25
1074         ror     w19,w19,#24
1075         ushr    v21.4s,v29.4s,#25
1076         ror     w20,w20,#24
1077         sli     v1.4s,v24.4s,#7
1078         add     w15,w15,w21
1079         sli     v5.4s,v25.4s,#7
1080         add     w16,w16,w17
1081         sli     v9.4s,v26.4s,#7
1082         add     w13,w13,w19
1083         sli     v13.4s,v27.4s,#7
1084         add     w14,w14,w20
1085         sli     v17.4s,v28.4s,#7
1086         eor     w10,w10,w15
1087         sli     v21.4s,v29.4s,#7
1088         eor     w11,w11,w16
            // NEON: rotate the 32-bit lanes of c (#8), d (#12) and b (#4) so
            // that the next NEON round pairs up the diagonals
1089         ext     v2.16b,v2.16b,v2.16b,#8
1090         eor     w12,w12,w13
1091         ext     v6.16b,v6.16b,v6.16b,#8
1092         eor     w9,w9,w14
1093         ext     v10.16b,v10.16b,v10.16b,#8
1094         ror     w10,w10,#25
1095         ext     v14.16b,v14.16b,v14.16b,#8
1096         ror     w11,w11,#25
1097         ext     v18.16b,v18.16b,v18.16b,#8
1098         ror     w12,w12,#25
1099         ext     v22.16b,v22.16b,v22.16b,#8
1100         ror     w9,w9,#25
1101         ext     v3.16b,v3.16b,v3.16b,#12
1102         ext     v7.16b,v7.16b,v7.16b,#12
1103         ext     v11.16b,v11.16b,v11.16b,#12
1104         ext     v15.16b,v15.16b,v15.16b,#12
1105         ext     v19.16b,v19.16b,v19.16b,#12
1106         ext     v23.16b,v23.16b,v23.16b,#12
1107         ext     v1.16b,v1.16b,v1.16b,#4
1108         ext     v5.16b,v5.16b,v5.16b,#4
1109         ext     v9.16b,v9.16b,v9.16b,#4
1110         ext     v13.16b,v13.16b,v13.16b,#4
1111         ext     v17.16b,v17.16b,v17.16b,#4
1112         ext     v21.16b,v21.16b,v21.16b,#4
            // scalar: second column round; NEON: round on the rotated lanes
1113         add     v0.4s,v0.4s,v1.4s
1114         add     w5,w5,w9
1115         add     v4.4s,v4.4s,v5.4s
1116         add     w6,w6,w10
1117         add     v8.4s,v8.4s,v9.4s
1118         add     w7,w7,w11
1119         add     v12.4s,v12.4s,v13.4s
1120         add     w8,w8,w12
1121         add     v16.4s,v16.4s,v17.4s
1122         eor     w17,w17,w5
1123         add     v20.4s,v20.4s,v21.4s
1124         eor     w19,w19,w6
1125         eor     v3.16b,v3.16b,v0.16b
1126         eor     w20,w20,w7
1127         eor     v7.16b,v7.16b,v4.16b
1128         eor     w21,w21,w8
1129         eor     v11.16b,v11.16b,v8.16b
1130         ror     w17,w17,#16
1131         eor     v15.16b,v15.16b,v12.16b
1132         ror     w19,w19,#16
1133         eor     v19.16b,v19.16b,v16.16b
1134         ror     w20,w20,#16
1135         eor     v23.16b,v23.16b,v20.16b
1136         ror     w21,w21,#16
1137         rev32   v3.8h,v3.8h
1138         add     w13,w13,w17
1139         rev32   v7.8h,v7.8h
1140         add     w14,w14,w19
1141         rev32   v11.8h,v11.8h
1142         add     w15,w15,w20
1143         rev32   v15.8h,v15.8h
1144         add     w16,w16,w21
1145         rev32   v19.8h,v19.8h
1146         eor     w9,w9,w13
1147         rev32   v23.8h,v23.8h
1148         eor     w10,w10,w14
1149         add     v2.4s,v2.4s,v3.4s
1150         eor     w11,w11,w15
1151         add     v6.4s,v6.4s,v7.4s
1152         eor     w12,w12,w16
1153         add     v10.4s,v10.4s,v11.4s
1154         ror     w9,w9,#20
1155         add     v14.4s,v14.4s,v15.4s
1156         ror     w10,w10,#20
1157         add     v18.4s,v18.4s,v19.4s
1158         ror     w11,w11,#20
1159         add     v22.4s,v22.4s,v23.4s
1160         ror     w12,w12,#20
1161         eor     v24.16b,v1.16b,v2.16b
1162         add     w5,w5,w9
1163         eor     v25.16b,v5.16b,v6.16b
1164         add     w6,w6,w10
1165         eor     v26.16b,v9.16b,v10.16b
1166         add     w7,w7,w11
1167         eor     v27.16b,v13.16b,v14.16b
1168         add     w8,w8,w12
1169         eor     v28.16b,v17.16b,v18.16b
1170         eor     w17,w17,w5
1171         eor     v29.16b,v21.16b,v22.16b
1172         eor     w19,w19,w6
1173         ushr    v1.4s,v24.4s,#20
1174         eor     w20,w20,w7
1175         ushr    v5.4s,v25.4s,#20
1176         eor     w21,w21,w8
1177         ushr    v9.4s,v26.4s,#20
1178         ror     w17,w17,#24
1179         ushr    v13.4s,v27.4s,#20
1180         ror     w19,w19,#24
1181         ushr    v17.4s,v28.4s,#20
1182         ror     w20,w20,#24
1183         ushr    v21.4s,v29.4s,#20
1184         ror     w21,w21,#24
1185         sli     v1.4s,v24.4s,#12
1186         add     w13,w13,w17
1187         sli     v5.4s,v25.4s,#12
1188         add     w14,w14,w19
1189         sli     v9.4s,v26.4s,#12
1190         add     w15,w15,w20
1191         sli     v13.4s,v27.4s,#12
1192         add     w16,w16,w21
1193         sli     v17.4s,v28.4s,#12
1194         eor     w9,w9,w13
1195         sli     v21.4s,v29.4s,#12
1196         eor     w10,w10,w14
1197         add     v0.4s,v0.4s,v1.4s
1198         eor     w11,w11,w15
1199         add     v4.4s,v4.4s,v5.4s
1200         eor     w12,w12,w16
1201         add     v8.4s,v8.4s,v9.4s
1202         ror     w9,w9,#25
1203         add     v12.4s,v12.4s,v13.4s
1204         ror     w10,w10,#25
1205         add     v16.4s,v16.4s,v17.4s
1206         ror     w11,w11,#25
1207         add     v20.4s,v20.4s,v21.4s
1208         ror     w12,w12,#25
1209         eor     v24.16b,v3.16b,v0.16b
            // scalar: second diagonal round starts with the next scalar add
1210         add     w5,w5,w10
1211         eor     v25.16b,v7.16b,v4.16b
1212         add     w6,w6,w11
1213         eor     v26.16b,v11.16b,v8.16b
1214         add     w7,w7,w12
1215         eor     v27.16b,v15.16b,v12.16b
1216         add     w8,w8,w9
1217         eor     v28.16b,v19.16b,v16.16b
1218         eor     w21,w21,w5
1219         eor     v29.16b,v23.16b,v20.16b
1220         eor     w17,w17,w6
1221         ushr    v3.4s,v24.4s,#24
1222         eor     w19,w19,w7
1223         ushr    v7.4s,v25.4s,#24
1224         eor     w20,w20,w8
1225         ushr    v11.4s,v26.4s,#24
1226         ror     w21,w21,#16
1227         ushr    v15.4s,v27.4s,#24
1228         ror     w17,w17,#16
1229         ushr    v19.4s,v28.4s,#24
1230         ror     w19,w19,#16
1231         ushr    v23.4s,v29.4s,#24
1232         ror     w20,w20,#16
1233         sli     v3.4s,v24.4s,#8
1234         add     w15,w15,w21
1235         sli     v7.4s,v25.4s,#8
1236         add     w16,w16,w17
1237         sli     v11.4s,v26.4s,#8
1238         add     w13,w13,w19
1239         sli     v15.4s,v27.4s,#8
1240         add     w14,w14,w20
1241         sli     v19.4s,v28.4s,#8
1242         eor     w10,w10,w15
1243         sli     v23.4s,v29.4s,#8
1244         eor     w11,w11,w16
1245         add     v2.4s,v2.4s,v3.4s
1246         eor     w12,w12,w13
1247         add     v6.4s,v6.4s,v7.4s
1248         eor     w9,w9,w14
1249         add     v10.4s,v10.4s,v11.4s
1250         ror     w10,w10,#20
1251         add     v14.4s,v14.4s,v15.4s
1252         ror     w11,w11,#20
1253         add     v18.4s,v18.4s,v19.4s
1254         ror     w12,w12,#20
1255         add     v22.4s,v22.4s,v23.4s
1256         ror     w9,w9,#20
1257         eor     v24.16b,v1.16b,v2.16b
1258         add     w5,w5,w10
1259         eor     v25.16b,v5.16b,v6.16b
1260         add     w6,w6,w11
1261         eor     v26.16b,v9.16b,v10.16b
1262         add     w7,w7,w12
1263         eor     v27.16b,v13.16b,v14.16b
1264         add     w8,w8,w9
1265         eor     v28.16b,v17.16b,v18.16b
1266         eor     w21,w21,w5
1267         eor     v29.16b,v21.16b,v22.16b
1268         eor     w17,w17,w6
1269         ushr    v1.4s,v24.4s,#25
1270         eor     w19,w19,w7
1271         ushr    v5.4s,v25.4s,#25
1272         eor     w20,w20,w8
1273         ushr    v9.4s,v26.4s,#25
1274         ror     w21,w21,#24
1275         ushr    v13.4s,v27.4s,#25
1276         ror     w17,w17,#24
1277         ushr    v17.4s,v28.4s,#25
1278         ror     w19,w19,#24
1279         ushr    v21.4s,v29.4s,#25
1280         ror     w20,w20,#24
1281         sli     v1.4s,v24.4s,#7
1282         add     w15,w15,w21
1283         sli     v5.4s,v25.4s,#7
1284         add     w16,w16,w17
1285         sli     v9.4s,v26.4s,#7
1286         add     w13,w13,w19
1287         sli     v13.4s,v27.4s,#7
1288         add     w14,w14,w20
1289         sli     v17.4s,v28.4s,#7
1290         eor     w10,w10,w15
1291         sli     v21.4s,v29.4s,#7
1292         eor     w11,w11,w16
            // NEON: rotate the lanes back (c #8, d #4, b #12) so the next pass
            // starts with a column round again
1293         ext     v2.16b,v2.16b,v2.16b,#8
1294         eor     w12,w12,w13
1295         ext     v6.16b,v6.16b,v6.16b,#8
1296         eor     w9,w9,w14
1297         ext     v10.16b,v10.16b,v10.16b,#8
1298         ror     w10,w10,#25
1299         ext     v14.16b,v14.16b,v14.16b,#8
1300         ror     w11,w11,#25
1301         ext     v18.16b,v18.16b,v18.16b,#8
1302         ror     w12,w12,#25
1303         ext     v22.16b,v22.16b,v22.16b,#8
1304         ror     w9,w9,#25
1305         ext     v3.16b,v3.16b,v3.16b,#4
1306         ext     v7.16b,v7.16b,v7.16b,#4
1307         ext     v11.16b,v11.16b,v11.16b,#4
1308         ext     v15.16b,v15.16b,v15.16b,#4
1309         ext     v19.16b,v19.16b,v19.16b,#4
1310         ext     v23.16b,v23.16b,v23.16b,#4
1311         ext     v1.16b,v1.16b,v1.16b,#12
1312         ext     v5.16b,v5.16b,v5.16b,#12
1313         ext     v9.16b,v9.16b,v9.16b,#12
1314         ext     v13.16b,v13.16b,v13.16b,#12
1315         ext     v17.16b,v17.16b,v17.16b,#12
1316         ext     v21.16b,v21.16b,v21.16b,#12
1317         cbnz    x4,.Loop_upper_neon
1318
// Finish the scalar block produced by the loop above: add the key block
// back into the working state, XOR the sum with 64 bytes read from [x1]
// and store 64 bytes to [x0].  x22-x28 and x30 each carry two 32-bit
// key-block words (even word in the low half, odd word in the high half).
// No NEON register is touched here.
1319         add     w5,w5,w22               // accumulate key block
1320         add     x6,x6,x22,lsr#32
1321         add     w7,w7,w23
1322         add     x8,x8,x23,lsr#32
1323         add     w9,w9,w24
1324         add     x10,x10,x24,lsr#32
1325         add     w11,w11,w25
1326         add     x12,x12,x25,lsr#32
1327         add     w13,w13,w26
1328         add     x14,x14,x26,lsr#32
1329         add     w15,w15,w27
1330         add     x16,x16,x27,lsr#32
1331         add     w17,w17,w28
1332         add     x19,x19,x28,lsr#32
1333         add     w20,w20,w30
1334         add     x21,x21,x30,lsr#32
1335
            // Merge each even/odd word pair into one 64-bit register.  The
            // lsl #32 also shifts out any carry the 64-bit adds above left
            // in bit 32 of the odd word.  Input loads are interleaved.
1336         add     x5,x5,x6,lsl#32 // pack
1337         add     x7,x7,x8,lsl#32
1338         ldp     x6,x8,[x1,#0]           // load input
1339         add     x9,x9,x10,lsl#32
1340         add     x11,x11,x12,lsl#32
1341         ldp     x10,x12,[x1,#16]
1342         add     x13,x13,x14,lsl#32
1343         add     x15,x15,x16,lsl#32
1344         ldp     x14,x16,[x1,#32]
1345         add     x17,x17,x19,lsl#32
1346         add     x20,x20,x21,lsl#32
1347         ldp     x19,x21,[x1,#48]
1348         add     x1,x1,#64
            // Big-endian builds only: byte-reverse every packed 64-bit word
            // before it is XORed with the input.
1349 #ifdef  __ARMEB__
1350         rev     x5,x5
1351         rev     x7,x7
1352         rev     x9,x9
1353         rev     x11,x11
1354         rev     x13,x13
1355         rev     x15,x15
1356         rev     x17,x17
1357         rev     x20,x20
1358 #endif
1359         eor     x5,x5,x6
1360         eor     x7,x7,x8
1361         eor     x9,x9,x10
1362         eor     x11,x11,x12
1363         eor     x13,x13,x14
1364         eor     x15,x15,x16
1365         eor     x17,x17,x19
1366         eor     x20,x20,x21
1367
            // Store the 64 output bytes while reloading w5-w21 from the key
            // block as the start state of the next scalar block.  x28 is
            // bumped first, so w17 below picks up the incremented counter.
1368         stp     x5,x7,[x0,#0]           // store output
1369         add     x28,x28,#1                      // increment counter
1370         mov     w5,w22                  // unpack key block
1371         lsr     x6,x22,#32
1372         stp     x9,x11,[x0,#16]
1373         mov     w7,w23
1374         lsr     x8,x23,#32
1375         stp     x13,x15,[x0,#32]
1376         mov     w9,w24
1377         lsr     x10,x24,#32
1378         stp     x17,x20,[x0,#48]
1379         add     x0,x0,#64
1380         mov     w11,w25
1381         lsr     x12,x25,#32
1382         mov     w13,w26
1383         lsr     x14,x26,#32
1384         mov     w15,w27
1385         lsr     x16,x27,#32
1386         mov     w17,w28
1387         lsr     x19,x28,#32
1388         mov     w20,w30
1389         lsr     x21,x30,#32
1390
// Lower half of the round loop: again 5 passes counted down in x4, with
// the same instruction mix as .Loop_upper_neon.
//   scalar: w5-w21 (w18 unused) hold a fresh 16-word state; every pass
//           runs column, diagonal, column, diagonal rounds (2 double
//           rounds per pass, 10 in total).
//   NEON:   v0-v23 carry on from where .Loop_upper_neon left them (the
//           code between the two loops does not touch them) and get one
//           column plus one diagonal round per pass; v24-v29 are
//           temporaries.
// No instruction in this loop writes the condition flags.
1391         mov     x4,#5
1392 .Loop_lower_neon:
1393         sub     x4,x4,#1
1394         add     v0.4s,v0.4s,v1.4s
1395         add     w5,w5,w9
1396         add     v4.4s,v4.4s,v5.4s
1397         add     w6,w6,w10
1398         add     v8.4s,v8.4s,v9.4s
1399         add     w7,w7,w11
1400         add     v12.4s,v12.4s,v13.4s
1401         add     w8,w8,w12
1402         add     v16.4s,v16.4s,v17.4s
1403         eor     w17,w17,w5
1404         add     v20.4s,v20.4s,v21.4s
1405         eor     w19,w19,w6
1406         eor     v3.16b,v3.16b,v0.16b
1407         eor     w20,w20,w7
1408         eor     v7.16b,v7.16b,v4.16b
1409         eor     w21,w21,w8
1410         eor     v11.16b,v11.16b,v8.16b
1411         ror     w17,w17,#16
1412         eor     v15.16b,v15.16b,v12.16b
1413         ror     w19,w19,#16
1414         eor     v19.16b,v19.16b,v16.16b
1415         ror     w20,w20,#16
1416         eor     v23.16b,v23.16b,v20.16b
1417         ror     w21,w21,#16
1418         rev32   v3.8h,v3.8h
1419         add     w13,w13,w17
1420         rev32   v7.8h,v7.8h
1421         add     w14,w14,w19
1422         rev32   v11.8h,v11.8h
1423         add     w15,w15,w20
1424         rev32   v15.8h,v15.8h
1425         add     w16,w16,w21
1426         rev32   v19.8h,v19.8h
1427         eor     w9,w9,w13
1428         rev32   v23.8h,v23.8h
1429         eor     w10,w10,w14
1430         add     v2.4s,v2.4s,v3.4s
1431         eor     w11,w11,w15
1432         add     v6.4s,v6.4s,v7.4s
1433         eor     w12,w12,w16
1434         add     v10.4s,v10.4s,v11.4s
1435         ror     w9,w9,#20
1436         add     v14.4s,v14.4s,v15.4s
1437         ror     w10,w10,#20
1438         add     v18.4s,v18.4s,v19.4s
1439         ror     w11,w11,#20
1440         add     v22.4s,v22.4s,v23.4s
1441         ror     w12,w12,#20
1442         eor     v24.16b,v1.16b,v2.16b
1443         add     w5,w5,w9
1444         eor     v25.16b,v5.16b,v6.16b
1445         add     w6,w6,w10
1446         eor     v26.16b,v9.16b,v10.16b
1447         add     w7,w7,w11
1448         eor     v27.16b,v13.16b,v14.16b
1449         add     w8,w8,w12
1450         eor     v28.16b,v17.16b,v18.16b
1451         eor     w17,w17,w5
1452         eor     v29.16b,v21.16b,v22.16b
1453         eor     w19,w19,w6
1454         ushr    v1.4s,v24.4s,#20
1455         eor     w20,w20,w7
1456         ushr    v5.4s,v25.4s,#20
1457         eor     w21,w21,w8
1458         ushr    v9.4s,v26.4s,#20
1459         ror     w17,w17,#24
1460         ushr    v13.4s,v27.4s,#20
1461         ror     w19,w19,#24
1462         ushr    v17.4s,v28.4s,#20
1463         ror     w20,w20,#24
1464         ushr    v21.4s,v29.4s,#20
1465         ror     w21,w21,#24
1466         sli     v1.4s,v24.4s,#12
1467         add     w13,w13,w17
1468         sli     v5.4s,v25.4s,#12
1469         add     w14,w14,w19
1470         sli     v9.4s,v26.4s,#12
1471         add     w15,w15,w20
1472         sli     v13.4s,v27.4s,#12
1473         add     w16,w16,w21
1474         sli     v17.4s,v28.4s,#12
1475         eor     w9,w9,w13
1476         sli     v21.4s,v29.4s,#12
1477         eor     w10,w10,w14
1478         add     v0.4s,v0.4s,v1.4s
1479         eor     w11,w11,w15
1480         add     v4.4s,v4.4s,v5.4s
1481         eor     w12,w12,w16
1482         add     v8.4s,v8.4s,v9.4s
1483         ror     w9,w9,#25
1484         add     v12.4s,v12.4s,v13.4s
1485         ror     w10,w10,#25
1486         add     v16.4s,v16.4s,v17.4s
1487         ror     w11,w11,#25
1488         add     v20.4s,v20.4s,v21.4s
1489         ror     w12,w12,#25
1490         eor     v24.16b,v3.16b,v0.16b
            // scalar: first diagonal round starts with the next scalar add
            // (groups w5/w10/w15/w21, w6/w11/w16/w17, w7/w12/w13/w19, w8/w9/w14/w20)
1491         add     w5,w5,w10
1492         eor     v25.16b,v7.16b,v4.16b
1493         add     w6,w6,w11
1494         eor     v26.16b,v11.16b,v8.16b
1495         add     w7,w7,w12
1496         eor     v27.16b,v15.16b,v12.16b
1497         add     w8,w8,w9
1498         eor     v28.16b,v19.16b,v16.16b
1499         eor     w21,w21,w5
1500         eor     v29.16b,v23.16b,v20.16b
1501         eor     w17,w17,w6
1502         ushr    v3.4s,v24.4s,#24
1503         eor     w19,w19,w7
1504         ushr    v7.4s,v25.4s,#24
1505         eor     w20,w20,w8
1506         ushr    v11.4s,v26.4s,#24
1507         ror     w21,w21,#16
1508         ushr    v15.4s,v27.4s,#24
1509         ror     w17,w17,#16
1510         ushr    v19.4s,v28.4s,#24
1511         ror     w19,w19,#16
1512         ushr    v23.4s,v29.4s,#24
1513         ror     w20,w20,#16
1514         sli     v3.4s,v24.4s,#8
1515         add     w15,w15,w21
1516         sli     v7.4s,v25.4s,#8
1517         add     w16,w16,w17
1518         sli     v11.4s,v26.4s,#8
1519         add     w13,w13,w19
1520         sli     v15.4s,v27.4s,#8
1521         add     w14,w14,w20
1522         sli     v19.4s,v28.4s,#8
1523         eor     w10,w10,w15
1524         sli     v23.4s,v29.4s,#8
1525         eor     w11,w11,w16
1526         add     v2.4s,v2.4s,v3.4s
1527         eor     w12,w12,w13
1528         add     v6.4s,v6.4s,v7.4s
1529         eor     w9,w9,w14
1530         add     v10.4s,v10.4s,v11.4s
1531         ror     w10,w10,#20
1532         add     v14.4s,v14.4s,v15.4s
1533         ror     w11,w11,#20
1534         add     v18.4s,v18.4s,v19.4s
1535         ror     w12,w12,#20
1536         add     v22.4s,v22.4s,v23.4s
1537         ror     w9,w9,#20
1538         eor     v24.16b,v1.16b,v2.16b
1539         add     w5,w5,w10
1540         eor     v25.16b,v5.16b,v6.16b
1541         add     w6,w6,w11
1542         eor     v26.16b,v9.16b,v10.16b
1543         add     w7,w7,w12
1544         eor     v27.16b,v13.16b,v14.16b
1545         add     w8,w8,w9
1546         eor     v28.16b,v17.16b,v18.16b
1547         eor     w21,w21,w5
1548         eor     v29.16b,v21.16b,v22.16b
1549         eor     w17,w17,w6
1550         ushr    v1.4s,v24.4s,#25
1551         eor     w19,w19,w7
1552         ushr    v5.4s,v25.4s,#25
1553         eor     w20,w20,w8
1554         ushr    v9.4s,v26.4s,#25
1555         ror     w21,w21,#24
1556         ushr    v13.4s,v27.4s,#25
1557         ror     w17,w17,#24
1558         ushr    v17.4s,v28.4s,#25
1559         ror     w19,w19,#24
1560         ushr    v21.4s,v29.4s,#25
1561         ror     w20,w20,#24
1562         sli     v1.4s,v24.4s,#7
1563         add     w15,w15,w21
1564         sli     v5.4s,v25.4s,#7
1565         add     w16,w16,w17
1566         sli     v9.4s,v26.4s,#7
1567         add     w13,w13,w19
1568         sli     v13.4s,v27.4s,#7
1569         add     w14,w14,w20
1570         sli     v17.4s,v28.4s,#7
1571         eor     w10,w10,w15
1572         sli     v21.4s,v29.4s,#7
1573         eor     w11,w11,w16
            // NEON: rotate the 32-bit lanes of c (#8), d (#12) and b (#4) so
            // that the next NEON round pairs up the diagonals
1574         ext     v2.16b,v2.16b,v2.16b,#8
1575         eor     w12,w12,w13
1576         ext     v6.16b,v6.16b,v6.16b,#8
1577         eor     w9,w9,w14
1578         ext     v10.16b,v10.16b,v10.16b,#8
1579         ror     w10,w10,#25
1580         ext     v14.16b,v14.16b,v14.16b,#8
1581         ror     w11,w11,#25
1582         ext     v18.16b,v18.16b,v18.16b,#8
1583         ror     w12,w12,#25
1584         ext     v22.16b,v22.16b,v22.16b,#8
1585         ror     w9,w9,#25
1586         ext     v3.16b,v3.16b,v3.16b,#12
1587         ext     v7.16b,v7.16b,v7.16b,#12
1588         ext     v11.16b,v11.16b,v11.16b,#12
1589         ext     v15.16b,v15.16b,v15.16b,#12
1590         ext     v19.16b,v19.16b,v19.16b,#12
1591         ext     v23.16b,v23.16b,v23.16b,#12
1592         ext     v1.16b,v1.16b,v1.16b,#4
1593         ext     v5.16b,v5.16b,v5.16b,#4
1594         ext     v9.16b,v9.16b,v9.16b,#4
1595         ext     v13.16b,v13.16b,v13.16b,#4
1596         ext     v17.16b,v17.16b,v17.16b,#4
1597         ext     v21.16b,v21.16b,v21.16b,#4
            // scalar: second column round; NEON: round on the rotated lanes
1598         add     v0.4s,v0.4s,v1.4s
1599         add     w5,w5,w9
1600         add     v4.4s,v4.4s,v5.4s
1601         add     w6,w6,w10
1602         add     v8.4s,v8.4s,v9.4s
1603         add     w7,w7,w11
1604         add     v12.4s,v12.4s,v13.4s
1605         add     w8,w8,w12
1606         add     v16.4s,v16.4s,v17.4s
1607         eor     w17,w17,w5
1608         add     v20.4s,v20.4s,v21.4s
1609         eor     w19,w19,w6
1610         eor     v3.16b,v3.16b,v0.16b
1611         eor     w20,w20,w7
1612         eor     v7.16b,v7.16b,v4.16b
1613         eor     w21,w21,w8
1614         eor     v11.16b,v11.16b,v8.16b
1615         ror     w17,w17,#16
1616         eor     v15.16b,v15.16b,v12.16b
1617         ror     w19,w19,#16
1618         eor     v19.16b,v19.16b,v16.16b
1619         ror     w20,w20,#16
1620         eor     v23.16b,v23.16b,v20.16b
1621         ror     w21,w21,#16
1622         rev32   v3.8h,v3.8h
1623         add     w13,w13,w17
1624         rev32   v7.8h,v7.8h
1625         add     w14,w14,w19
1626         rev32   v11.8h,v11.8h
1627         add     w15,w15,w20
1628         rev32   v15.8h,v15.8h
1629         add     w16,w16,w21
1630         rev32   v19.8h,v19.8h
1631         eor     w9,w9,w13
1632         rev32   v23.8h,v23.8h
1633         eor     w10,w10,w14
1634         add     v2.4s,v2.4s,v3.4s
1635         eor     w11,w11,w15
1636         add     v6.4s,v6.4s,v7.4s
1637         eor     w12,w12,w16
1638         add     v10.4s,v10.4s,v11.4s
1639         ror     w9,w9,#20
1640         add     v14.4s,v14.4s,v15.4s
1641         ror     w10,w10,#20
1642         add     v18.4s,v18.4s,v19.4s
1643         ror     w11,w11,#20
1644         add     v22.4s,v22.4s,v23.4s
1645         ror     w12,w12,#20
1646         eor     v24.16b,v1.16b,v2.16b
1647         add     w5,w5,w9
1648         eor     v25.16b,v5.16b,v6.16b
1649         add     w6,w6,w10
1650         eor     v26.16b,v9.16b,v10.16b
1651         add     w7,w7,w11
1652         eor     v27.16b,v13.16b,v14.16b
1653         add     w8,w8,w12
1654         eor     v28.16b,v17.16b,v18.16b
1655         eor     w17,w17,w5
1656         eor     v29.16b,v21.16b,v22.16b
1657         eor     w19,w19,w6
1658         ushr    v1.4s,v24.4s,#20
1659         eor     w20,w20,w7
1660         ushr    v5.4s,v25.4s,#20
1661         eor     w21,w21,w8
1662         ushr    v9.4s,v26.4s,#20
1663         ror     w17,w17,#24
1664         ushr    v13.4s,v27.4s,#20
1665         ror     w19,w19,#24
1666         ushr    v17.4s,v28.4s,#20
1667         ror     w20,w20,#24
1668         ushr    v21.4s,v29.4s,#20
1669         ror     w21,w21,#24
1670         sli     v1.4s,v24.4s,#12
1671         add     w13,w13,w17
1672         sli     v5.4s,v25.4s,#12
1673         add     w14,w14,w19
1674         sli     v9.4s,v26.4s,#12
1675         add     w15,w15,w20
1676         sli     v13.4s,v27.4s,#12
1677         add     w16,w16,w21
1678         sli     v17.4s,v28.4s,#12
1679         eor     w9,w9,w13
1680         sli     v21.4s,v29.4s,#12
1681         eor     w10,w10,w14
1682         add     v0.4s,v0.4s,v1.4s
1683         eor     w11,w11,w15
1684         add     v4.4s,v4.4s,v5.4s
1685         eor     w12,w12,w16
1686         add     v8.4s,v8.4s,v9.4s
1687         ror     w9,w9,#25
1688         add     v12.4s,v12.4s,v13.4s
1689         ror     w10,w10,#25
1690         add     v16.4s,v16.4s,v17.4s
1691         ror     w11,w11,#25
1692         add     v20.4s,v20.4s,v21.4s
1693         ror     w12,w12,#25
1694         eor     v24.16b,v3.16b,v0.16b
            // scalar: second diagonal round starts with the next scalar add
1695         add     w5,w5,w10
1696         eor     v25.16b,v7.16b,v4.16b
1697         add     w6,w6,w11
1698         eor     v26.16b,v11.16b,v8.16b
1699         add     w7,w7,w12
1700         eor     v27.16b,v15.16b,v12.16b
1701         add     w8,w8,w9
1702         eor     v28.16b,v19.16b,v16.16b
1703         eor     w21,w21,w5
1704         eor     v29.16b,v23.16b,v20.16b
1705         eor     w17,w17,w6
1706         ushr    v3.4s,v24.4s,#24
1707         eor     w19,w19,w7
1708         ushr    v7.4s,v25.4s,#24
1709         eor     w20,w20,w8
1710         ushr    v11.4s,v26.4s,#24
1711         ror     w21,w21,#16
1712         ushr    v15.4s,v27.4s,#24
1713         ror     w17,w17,#16
1714         ushr    v19.4s,v28.4s,#24
1715         ror     w19,w19,#16
1716         ushr    v23.4s,v29.4s,#24
1717         ror     w20,w20,#16
1718         sli     v3.4s,v24.4s,#8
1719         add     w15,w15,w21
1720         sli     v7.4s,v25.4s,#8
1721         add     w16,w16,w17
1722         sli     v11.4s,v26.4s,#8
1723         add     w13,w13,w19
1724         sli     v15.4s,v27.4s,#8
1725         add     w14,w14,w20
1726         sli     v19.4s,v28.4s,#8
1727         eor     w10,w10,w15
1728         sli     v23.4s,v29.4s,#8
1729         eor     w11,w11,w16
1730         add     v2.4s,v2.4s,v3.4s
1731         eor     w12,w12,w13
1732         add     v6.4s,v6.4s,v7.4s
1733         eor     w9,w9,w14
1734         add     v10.4s,v10.4s,v11.4s
1735         ror     w10,w10,#20
1736         add     v14.4s,v14.4s,v15.4s
1737         ror     w11,w11,#20
1738         add     v18.4s,v18.4s,v19.4s
1739         ror     w12,w12,#20
1740         add     v22.4s,v22.4s,v23.4s
1741         ror     w9,w9,#20
1742         eor     v24.16b,v1.16b,v2.16b
1743         add     w5,w5,w10
1744         eor     v25.16b,v5.16b,v6.16b
1745         add     w6,w6,w11
1746         eor     v26.16b,v9.16b,v10.16b
1747         add     w7,w7,w12
1748         eor     v27.16b,v13.16b,v14.16b
1749         add     w8,w8,w9
1750         eor     v28.16b,v17.16b,v18.16b
1751         eor     w21,w21,w5
1752         eor     v29.16b,v21.16b,v22.16b
1753         eor     w17,w17,w6
1754         ushr    v1.4s,v24.4s,#25
1755         eor     w19,w19,w7
1756         ushr    v5.4s,v25.4s,#25
1757         eor     w20,w20,w8
1758         ushr    v9.4s,v26.4s,#25
1759         ror     w21,w21,#24
1760         ushr    v13.4s,v27.4s,#25
1761         ror     w17,w17,#24
1762         ushr    v17.4s,v28.4s,#25
1763         ror     w19,w19,#24
1764         ushr    v21.4s,v29.4s,#25
1765         ror     w20,w20,#24
1766         sli     v1.4s,v24.4s,#7
1767         add     w15,w15,w21
1768         sli     v5.4s,v25.4s,#7
1769         add     w16,w16,w17
1770         sli     v9.4s,v26.4s,#7
1771         add     w13,w13,w19
1772         sli     v13.4s,v27.4s,#7
1773         add     w14,w14,w20
1774         sli     v17.4s,v28.4s,#7
1775         eor     w10,w10,w15
1776         sli     v21.4s,v29.4s,#7
1777         eor     w11,w11,w16
            // NEON: rotate the lanes back (c #8, d #4, b #12) so the next pass
            // starts with a column round again
1778         ext     v2.16b,v2.16b,v2.16b,#8
1779         eor     w12,w12,w13
1780         ext     v6.16b,v6.16b,v6.16b,#8
1781         eor     w9,w9,w14
1782         ext     v10.16b,v10.16b,v10.16b,#8
1783         ror     w10,w10,#25
1784         ext     v14.16b,v14.16b,v14.16b,#8
1785         ror     w11,w11,#25
1786         ext     v18.16b,v18.16b,v18.16b,#8
1787         ror     w12,w12,#25
1788         ext     v22.16b,v22.16b,v22.16b,#8
1789         ror     w9,w9,#25
1790         ext     v3.16b,v3.16b,v3.16b,#4
1791         ext     v7.16b,v7.16b,v7.16b,#4
1792         ext     v11.16b,v11.16b,v11.16b,#4
1793         ext     v15.16b,v15.16b,v15.16b,#4
1794         ext     v19.16b,v19.16b,v19.16b,#4
1795         ext     v23.16b,v23.16b,v23.16b,#4
1796         ext     v1.16b,v1.16b,v1.16b,#12
1797         ext     v5.16b,v5.16b,v5.16b,#12
1798         ext     v9.16b,v9.16b,v9.16b,#12
1799         ext     v13.16b,v13.16b,v13.16b,#12
1800         ext     v17.16b,v17.16b,v17.16b,#12
1801         ext     v21.16b,v21.16b,v21.16b,#12
1802         cbnz    x4,.Loop_lower_neon
1803
// Output stage of the 512-byte NEON path. Adds the key block to the working
// state (one scalar block in x5..x21, skipping x18, and six NEON blocks in
// v0..v23), XORs the result with the input at x1 and stores it at x0:
// 7 x 64 = 448 bytes in total. Scalar and NEON instructions are interleaved
// by the generator; nothing in this stage modifies the condition flags, and
// the closing b.hs depends on that, so keep the statement order as is.
//
// x22..x30 (skipping x29) hold the key block as packed 32-bit pairs: the
// "add wN,wN,wM" lines add the low word, the "add xN,xN,xM,lsr#32" lines add
// the high word. Bits above 31 left in the odd registers are shifted out by
// the lsl#32 in the pack step further down.
1804         add     w5,w5,w22               // accumulate key block
             // v24..v29 are reloaded from the stack off-load area at sp+0..95.
1805         ldp     q24,q25,[sp,#0]
1806         add     x6,x6,x22,lsr#32
1807         ldp     q26,q27,[sp,#32]
1808         add     w7,w7,w23
1809         ldp     q28,q29,[sp,#64]
1810         add     x8,x8,x23,lsr#32
             // Row 0 of all six NEON blocks gets v24, row 2 gets v26.
1811         add     v0.4s,v0.4s,v24.4s
1812         add     w9,w9,w24
1813         add     v4.4s,v4.4s,v24.4s
1814         add     x10,x10,x24,lsr#32
1815         add     v8.4s,v8.4s,v24.4s
1816         add     w11,w11,w25
1817         add     v12.4s,v12.4s,v24.4s
1818         add     x12,x12,x25,lsr#32
1819         add     v16.4s,v16.4s,v24.4s
1820         add     w13,w13,w26
1821         add     v20.4s,v20.4s,v24.4s
1822         add     x14,x14,x26,lsr#32
1823         add     v2.4s,v2.4s,v26.4s
1824         add     w15,w15,w27
1825         add     v6.4s,v6.4s,v26.4s
1826         add     x16,x16,x27,lsr#32
1827         add     v10.4s,v10.4s,v26.4s
1828         add     w17,w17,w28
1829         add     v14.4s,v14.4s,v26.4s
1830         add     x19,x19,x28,lsr#32
1831         add     v18.4s,v18.4s,v26.4s
1832         add     w20,w20,w30
1833         add     v22.4s,v22.4s,v26.4s
1834         add     x21,x21,x30,lsr#32
             // Row 3 differs per block: v3/v7/v11/v15 take v27/v28/v29/v30,
             // while v19/v23 reuse v27/v28 advanced by the step held in v31.
1835         add     v19.4s,v19.4s,v31.4s                    // +4
             // Pack each low/high word pair into one 64-bit register; the odd
             // registers (x6, x8, ...) are then free and receive the input.
1836         add     x5,x5,x6,lsl#32 // pack
1837         add     v23.4s,v23.4s,v31.4s                    // +4
1838         add     x7,x7,x8,lsl#32
1839         add     v3.4s,v3.4s,v27.4s
1840         ldp     x6,x8,[x1,#0]           // load input
1841         add     v7.4s,v7.4s,v28.4s
1842         add     x9,x9,x10,lsl#32
1843         add     v11.4s,v11.4s,v29.4s
1844         add     x11,x11,x12,lsl#32
1845         add     v15.4s,v15.4s,v30.4s
1846         ldp     x10,x12,[x1,#16]
1847         add     v19.4s,v19.4s,v27.4s
1848         add     x13,x13,x14,lsl#32
1849         add     v23.4s,v23.4s,v28.4s
1850         add     x15,x15,x16,lsl#32
             // Row 1 of all six NEON blocks gets v25.
1851         add     v1.4s,v1.4s,v25.4s
1852         ldp     x14,x16,[x1,#32]
1853         add     v5.4s,v5.4s,v25.4s
1854         add     x17,x17,x19,lsl#32
1855         add     v9.4s,v9.4s,v25.4s
1856         add     x20,x20,x21,lsl#32
1857         add     v13.4s,v13.4s,v25.4s
1858         ldp     x19,x21,[x1,#48]
1859         add     v17.4s,v17.4s,v25.4s
             // The scalar block's 64 input bytes have been fetched; advance x1.
1860         add     x1,x1,#64
1861         add     v21.4s,v21.4s,v25.4s
1862
             // Big-endian builds byte-swap the packed scalar words before the XOR.
1863 #ifdef  __ARMEB__
1864         rev     x5,x5
1865         rev     x7,x7
1866         rev     x9,x9
1867         rev     x11,x11
1868         rev     x13,x13
1869         rev     x15,x15
1870         rev     x17,x17
1871         rev     x20,x20
1872 #endif
             // All additions that read v24..v27 are done, so those registers
             // now double as input buffers (post-incrementing x1 by 64).
1873         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1874         eor     x5,x5,x6
1875         eor     x7,x7,x8
1876         eor     x9,x9,x10
1877         eor     x11,x11,x12
1878         eor     x13,x13,x14
1879         eor     v0.16b,v0.16b,v24.16b
1880         eor     x15,x15,x16
1881         eor     v1.16b,v1.16b,v25.16b
1882         eor     x17,x17,x19
1883         eor     v2.16b,v2.16b,v26.16b
1884         eor     x20,x20,x21
1885         eor     v3.16b,v3.16b,v27.16b
1886         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1887
1888         stp     x5,x7,[x0,#0]           // store output
             // This stage stores seven 64-byte blocks (one scalar, six NEON).
             // NOTE(review): the +7 presumably corresponds to those seven
             // blocks; confirm against the part of the loop outside this excerpt.
1889         add     x28,x28,#7                      // increment counter
1890         stp     x9,x11,[x0,#16]
1891         stp     x13,x15,[x0,#32]
1892         stp     x17,x20,[x0,#48]
1893         add     x0,x0,#64
1894         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1895
             // From here on each register group that has just been stored is
             // reused as the input buffer for a later group.
1896         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1897         eor     v4.16b,v4.16b,v24.16b
1898         eor     v5.16b,v5.16b,v25.16b
1899         eor     v6.16b,v6.16b,v26.16b
1900         eor     v7.16b,v7.16b,v27.16b
1901         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1902
1903         ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1904         eor     v8.16b,v8.16b,v0.16b
             // v24..v27 held input data above; restore them from the stack
             // off-load area before v27 is advanced at the end of this stage.
1905         ldp     q24,q25,[sp,#0]
1906         eor     v9.16b,v9.16b,v1.16b
1907         ldp     q26,q27,[sp,#32]
1908         eor     v10.16b,v10.16b,v2.16b
1909         eor     v11.16b,v11.16b,v3.16b
1910         st1     {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1911
1912         ld1     {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1913         eor     v12.16b,v12.16b,v4.16b
1914         eor     v13.16b,v13.16b,v5.16b
1915         eor     v14.16b,v14.16b,v6.16b
1916         eor     v15.16b,v15.16b,v7.16b
1917         st1     {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1918
1919         ld1     {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1920         eor     v16.16b,v16.16b,v8.16b
1921         eor     v17.16b,v17.16b,v9.16b
1922         eor     v18.16b,v18.16b,v10.16b
1923         eor     v19.16b,v19.16b,v11.16b
1924         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1925
             // v0 is free again (its input data was consumed above); use it as
             // a temporary for twice the step held in v31.
1926         shl     v0.4s,v31.4s,#1                 // 4 -> 8
1927         eor     v20.16b,v20.16b,v12.16b
1928         eor     v21.16b,v21.16b,v13.16b
1929         eor     v22.16b,v22.16b,v14.16b
1930         eor     v23.16b,v23.16b,v15.16b
1931         st1     {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1932
             // Advance the four per-block rows v27..v30 for the next pass.
1933         add     v27.4s,v27.4s,v0.4s                     // += 8
1934         add     v28.4s,v28.4s,v0.4s
1935         add     v29.4s,v29.4s,v0.4s
1936         add     v30.4s,v30.4s,v0.4s
1937
             // The flags tested here were set before this stage.
             // NOTE(review): presumably by a flag-setting subtract on x2
             // outside this excerpt; confirm.
1938         b.hs    .Loop_outer_512_neon
1939
// Exit path taken when the b.hs back to .Loop_outer_512_neon falls through:
// restore d8..d15, overwrite the stack off-load area, then either finish
// (.Ldone_512_neon) or hand the remaining bytes to .Loop_outer_neon or
// .Loop_outer.
// NOTE(review): x2 presumably holds the remaining length biased by -512 at
// this point; the adds undoes that bias. Confirm against the code outside
// this excerpt.
1940         adds    x2,x2,#512                      // sets Z when x2 becomes 0; consumed by the b.eq below
1941         ushr    v0.4s,v31.4s,#2                 // 4 -> 1
1942
             // d8..d15 are reloaded from sp+128..sp+191.
1943         ldp     d8,d9,[sp,#128+0]               // meet ABI requirements
1944         ldp     d10,d11,[sp,#128+16]
1945         ldp     d12,d13,[sp,#128+32]
1946         ldp     d14,d15,[sp,#128+48]
1947
             // Overwrite all 96 bytes at sp+0..95 with copies of q24 and q31.
1948         stp     q24,q31,[sp,#0]         // wipe off-load area
1949         stp     q24,q31,[sp,#32]
1950         stp     q24,q31,[sp,#64]
1951
             // None of the ushr/ldp/stp instructions above touch the flags, so
             // this still tests the Z flag produced by the adds.
1952         b.eq    .Ldone_512_neon
1953
             // Data remains. The cmp sets the flags for the b.hs below; the
             // sub and add instructions in between do not modify them.
1954         cmp     x2,#192
1955         sub     v27.4s,v27.4s,v0.4s                     // -= 1
1956         sub     v28.4s,v28.4s,v0.4s
1957         sub     v29.4s,v29.4s,v0.4s
1958         add     sp,sp,#128                      // release 128 bytes of stack on this path
1959         b.hs    .Loop_outer_neon                // x2 >= 192 (unsigned)
1960
             // x2 < 192: clear v25..v30 and continue in .Loop_outer.
             // NOTE(review): presumably the scalar-only loop, which does not
             // need these vectors; confirm at the label.
1961         eor     v25.16b,v25.16b,v25.16b
1962         eor     v26.16b,v26.16b,v26.16b
1963         eor     v27.16b,v27.16b,v27.16b
1964         eor     v28.16b,v28.16b,v28.16b
1965         eor     v29.16b,v29.16b,v29.16b
1966         eor     v30.16b,v30.16b,v30.16b
1967         b       .Loop_outer
1968
// Epilogue of ChaCha20_512_neon: restore x19..x28 from the frame addressed
// by x29, release the local stack area, pop x29/x30 and return.
// Within this excerpt the label is reached from the b.eq that follows
// "adds x2,x2,#512", which is taken before the "add sp,sp,#128" on the
// fall-through path; that is why 128 is released here as well.
1969 .Ldone_512_neon:
1970         ldp     x19,x20,[x29,#16]
             // NOTE(review): the extra 64 presumably matches a 64-byte area
             // reserved by the prologue, leaving sp equal to x29; confirm.
1971         add     sp,sp,#128+64
1972         ldp     x21,x22,[x29,#32]
1973         ldp     x23,x24,[x29,#48]
1974         ldp     x25,x26,[x29,#64]
1975         ldp     x27,x28,[x29,#80]
1976         ldp     x29,x30,[sp],#96                // reload x29/x30 and release the 96-byte frame
             // Emitted as a raw .inst word rather than as the mnemonic.
             // NOTE(review): presumably pairs with a paciasp in the prologue
             // of this function; confirm.
1977 .inst   0xd50323bf                      // autiasp
1978         ret
1979 .size   ChaCha20_512_neon,.-ChaCha20_512_neon