]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/crypto/openssl/aarch64/chacha-armv8.S
Use a template assembly file to generate the embedded MFS.
[FreeBSD/FreeBSD.git] / sys / crypto / openssl / aarch64 / chacha-armv8.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
3 #include "arm_arch.h"
4
5 .text
6
7
8
// Constant pool. It follows the .text directive above and is addressed
// PC-relatively with `adr` by the functions in this file.
9 .align  5
10 .Lsigma:
// Stored little-endian, these two quadwords are the 16 ASCII bytes
// "expand 32-byte k". They are loaded as the first four 32-bit state words.
11 .quad   0x3320646e61707865,0x6b20657479622d32           // endian-neutral
12 .Lone:
// Vector {1,0,0,0}. The NEON code loads it into v31 (from .Lsigma+16) and adds
// it to the counter vector, so only 32-bit lane 0 is incremented.
13 .long   1,0,0,0
14 .LOPENSSL_armcap_P:
// Offset from this location to the symbol OPENSSL_armcap_P. ChaCha20_ctr32
// adds it to the address of this label to reach that word without an
// absolute relocation. Width is 32 bits under __ILP32__, 64 bits otherwise.
15 #ifdef  __ILP32__
16 .long   OPENSSL_armcap_P-.
17 #else
18 .quad   OPENSSL_armcap_P-.
19 #endif
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
20 .byte   67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
21 .align  2
22
// ChaCha20_ctr32 -- exported entry point; capability dispatch and the
// prologue of the integer-only implementation.
//
// Register arguments, as used by the code below:
//   x0 = output pointer
//   x1 = input pointer
//   x2 = length in bytes (zero returns immediately via .Labort)
//   x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): presumably matches the C prototype of ChaCha20_ctr32 in
// OpenSSL -- confirm against the declaring header.
//
// Inputs of 192 bytes or more are handed to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs the
// scalar code starting at .Lshort.
23 .globl  ChaCha20_ctr32
24 .type   ChaCha20_ctr32,%function
25 .align  5
26 ChaCha20_ctr32:
27         cbz     x2,.Labort
28         adr     x5,.LOPENSSL_armcap_P
29         cmp     x2,#192
30         b.lo    .Lshort
// x6 = stored offset; x5 + x6 is the address of OPENSSL_armcap_P.
31 #ifdef  __ILP32__
32         ldrsw   x6,[x5]
33 #else
34         ldr     x6,[x5]
35 #endif
36         ldr     w17,[x6,x5]
37         tst     w17,#ARMV7_NEON
38         b.ne    ChaCha20_neon
39
// Scalar path. Frame: 96 bytes holding x29/x30 and x19-x28, with x29 as the
// frame pointer, plus 64 bytes of scratch below it. The scratch is written
// only by the partial-block tail code (.Less_than_64).
40 .Lshort:
41 .inst   0xd503233f                      // paciasp
42         stp     x29,x30,[sp,#-96]!
43         add     x29,sp,#0
44
45         adr     x5,.Lsigma
46         stp     x19,x20,[sp,#16]
47         stp     x21,x22,[sp,#32]
48         stp     x23,x24,[sp,#48]
49         stp     x25,x26,[sp,#64]
50         stp     x27,x28,[sp,#80]
51         sub     sp,sp,#64
52
// The 64-byte input state is kept packed, two 32-bit words per register:
//   x22,x23 = sigma   x24-x27 = key   x28,x30 = counter block
// x30 (the link register) is free for this because it was saved above and
// is reloaded in the epilogue.
53         ldp     x22,x23,[x5]            // load sigma
54         ldp     x24,x25,[x3]            // load key
55         ldp     x26,x27,[x3,#16]
56         ldp     x28,x30,[x4]            // load counter
// Big-endian builds: rotating by 32 swaps the two 32-bit halves of each
// 64-bit load.
57 #ifdef  __ARMEB__
58         ror     x24,x24,#32
59         ror     x25,x25,#32
60         ror     x26,x26,#32
61         ror     x27,x27,#32
62         ror     x28,x28,#32
63         ror     x30,x30,#32
64 #endif
65
// Scalar per-block loop: produces one 64-byte key-stream block per
// iteration and XORs it with 64 bytes of input.
//
// Working state, one 32-bit word per register:
//   w5-w8            = sigma words       (from x22,x23)
//   w9-w12           = key words 0..3    (from x24,x25)
//   w13-w16          = key words 4..7    (from x26,x27)
//   w17,w19,w20,w21  = counter block     (from x28,x30)
66 .Loop_outer:
67         mov     w5,w22                  // unpack key block
68         lsr     x6,x22,#32
69         mov     w7,w23
70         lsr     x8,x23,#32
71         mov     w9,w24
72         lsr     x10,x24,#32
73         mov     w11,w25
74         lsr     x12,x25,#32
75         mov     w13,w26
76         lsr     x14,x26,#32
77         mov     w15,w27
78         lsr     x16,x27,#32
79         mov     w17,w28
80         lsr     x19,x28,#32
81         mov     w20,w30
82         lsr     x21,x30,#32
83
// x4 counts 10 passes of .Loop; each pass is a column round followed by a
// diagonal round. The flags set by `subs` stay live until the b.lo/b.hi
// after the rounds: nothing in between writes NZCV.
84         mov     x4,#10
85         subs    x2,x2,#64
86 .Loop:
87         sub     x4,x4,#1
// Column round, four columns in parallel:
//   a += b; d ^= a; d = rotl32(d,16)      (ror #16)
88         add     w5,w5,w9
89         add     w6,w6,w10
90         add     w7,w7,w11
91         add     w8,w8,w12
92         eor     w17,w17,w5
93         eor     w19,w19,w6
94         eor     w20,w20,w7
95         eor     w21,w21,w8
96         ror     w17,w17,#16
97         ror     w19,w19,#16
98         ror     w20,w20,#16
99         ror     w21,w21,#16
//   c += d; b ^= c; b = rotl32(b,12)      (ror #20)
100         add     w13,w13,w17
101         add     w14,w14,w19
102         add     w15,w15,w20
103         add     w16,w16,w21
104         eor     w9,w9,w13
105         eor     w10,w10,w14
106         eor     w11,w11,w15
107         eor     w12,w12,w16
108         ror     w9,w9,#20
109         ror     w10,w10,#20
110         ror     w11,w11,#20
111         ror     w12,w12,#20
//   a += b; d ^= a; d = rotl32(d,8)       (ror #24)
112         add     w5,w5,w9
113         add     w6,w6,w10
114         add     w7,w7,w11
115         add     w8,w8,w12
116         eor     w17,w17,w5
117         eor     w19,w19,w6
118         eor     w20,w20,w7
119         eor     w21,w21,w8
120         ror     w17,w17,#24
121         ror     w19,w19,#24
122         ror     w20,w20,#24
123         ror     w21,w21,#24
//   c += d; b ^= c; b = rotl32(b,7)       (ror #25)
124         add     w13,w13,w17
125         add     w14,w14,w19
126         add     w15,w15,w20
127         add     w16,w16,w21
128         eor     w9,w9,w13
129         eor     w10,w10,w14
130         eor     w11,w11,w15
131         eor     w12,w12,w16
132         ror     w9,w9,#25
133         ror     w10,w10,#25
134         ror     w11,w11,#25
135         ror     w12,w12,#25
// Diagonal round: the same four steps, with the b/c/d operands taken from
// rotated register positions instead of moving any data.
136         add     w5,w5,w10
137         add     w6,w6,w11
138         add     w7,w7,w12
139         add     w8,w8,w9
140         eor     w21,w21,w5
141         eor     w17,w17,w6
142         eor     w19,w19,w7
143         eor     w20,w20,w8
144         ror     w21,w21,#16
145         ror     w17,w17,#16
146         ror     w19,w19,#16
147         ror     w20,w20,#16
148         add     w15,w15,w21
149         add     w16,w16,w17
150         add     w13,w13,w19
151         add     w14,w14,w20
152         eor     w10,w10,w15
153         eor     w11,w11,w16
154         eor     w12,w12,w13
155         eor     w9,w9,w14
156         ror     w10,w10,#20
157         ror     w11,w11,#20
158         ror     w12,w12,#20
159         ror     w9,w9,#20
160         add     w5,w5,w10
161         add     w6,w6,w11
162         add     w7,w7,w12
163         add     w8,w8,w9
164         eor     w21,w21,w5
165         eor     w17,w17,w6
166         eor     w19,w19,w7
167         eor     w20,w20,w8
168         ror     w21,w21,#24
169         ror     w17,w17,#24
170         ror     w19,w19,#24
171         ror     w20,w20,#24
172         add     w15,w15,w21
173         add     w16,w16,w17
174         add     w13,w13,w19
175         add     w14,w14,w20
176         eor     w10,w10,w15
177         eor     w11,w11,w16
178         eor     w12,w12,w13
179         eor     w9,w9,w14
180         ror     w10,w10,#25
181         ror     w11,w11,#25
182         ror     w12,w12,#25
183         ror     w9,w9,#25
184         cbnz    x4,.Loop
185
// Add the original input state back in. The odd-numbered words use a 64-bit
// add of the packed register's upper half; any carry into bit 32 is shifted
// out by the `lsl#32` in the pack step.
186         add     w5,w5,w22               // accumulate key block
187         add     x6,x6,x22,lsr#32
188         add     w7,w7,w23
189         add     x8,x8,x23,lsr#32
190         add     w9,w9,w24
191         add     x10,x10,x24,lsr#32
192         add     w11,w11,w25
193         add     x12,x12,x25,lsr#32
194         add     w13,w13,w26
195         add     x14,x14,x26,lsr#32
196         add     w15,w15,w27
197         add     x16,x16,x27,lsr#32
198         add     w17,w17,w28
199         add     x19,x19,x28,lsr#32
200         add     w20,w20,w30
201         add     x21,x21,x30,lsr#32
202
// Flags are still those of `subs x2,x2,#64`: "lo" means fewer than 64 bytes
// were left, so the block is finished byte-by-byte at .Ltail.
203         b.lo    .Ltail
204
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed.
205         add     x5,x5,x6,lsl#32 // pack
206         add     x7,x7,x8,lsl#32
207         ldp     x6,x8,[x1,#0]           // load input
208         add     x9,x9,x10,lsl#32
209         add     x11,x11,x12,lsl#32
210         ldp     x10,x12,[x1,#16]
211         add     x13,x13,x14,lsl#32
212         add     x15,x15,x16,lsl#32
213         ldp     x14,x16,[x1,#32]
214         add     x17,x17,x19,lsl#32
215         add     x20,x20,x21,lsl#32
216         ldp     x19,x21,[x1,#48]
217         add     x1,x1,#64
218 #ifdef  __ARMEB__
219         rev     x5,x5
220         rev     x7,x7
221         rev     x9,x9
222         rev     x11,x11
223         rev     x13,x13
224         rev     x15,x15
225         rev     x17,x17
226         rev     x20,x20
227 #endif
228         eor     x5,x5,x6
229         eor     x7,x7,x8
230         eor     x9,x9,x10
231         eor     x11,x11,x12
232         eor     x13,x13,x14
233         eor     x15,x15,x16
234         eor     x17,x17,x19
235         eor     x20,x20,x21
236
// The counter increment is a 64-bit add on the packed x28, so a wrap of the
// low 32-bit word would carry into the word packed above it.
// NOTE(review): callers presumably keep the 32-bit counter from wrapping
// within one call -- confirm.
237         stp     x5,x7,[x0,#0]           // store output
238         add     x28,x28,#1                      // increment counter
239         stp     x9,x11,[x0,#16]
240         stp     x13,x15,[x0,#32]
241         stp     x17,x20,[x0,#48]
242         add     x0,x0,#64
243
// "hi" (same `subs` flags) means more than 64 bytes were left before this
// block, so there is more input to process.
244         b.hi    .Loop_outer
245
// Epilogue: restore callee-saved registers through the frame pointer and
// release the 64-byte scratch area and the 96-byte frame.
246         ldp     x19,x20,[x29,#16]
247         add     sp,sp,#64
248         ldp     x21,x22,[x29,#32]
249         ldp     x23,x24,[x29,#48]
250         ldp     x25,x26,[x29,#64]
251         ldp     x27,x28,[x29,#80]
252         ldp     x29,x30,[sp],#96
253 .inst   0xd50323bf                      // autiasp
// .Labort is also the target of the zero-length check at function entry,
// which is taken before any frame is set up.
254 .Labort:
255         ret
256
// Partial final block of the scalar path (fewer than 64 bytes left).
// .Ltail undoes the `subs x2,x2,#64` so x2 is again the remaining byte
// count. .Less_than_64 is also branched to from .Ltail_neon, with x2 already
// holding the remaining count and the key-stream words in the same registers.
257 .align  4
258 .Ltail:
259         add     x2,x2,#64
260 .Less_than_64:
// Point x1, x0 and x4 just past the end of input, output and the stack copy
// of the key stream, then index them with a negative x2 counting up to zero.
// x0 is biased by -1 because the loop increments x2 before the store.
261         sub     x0,x0,#1
262         add     x1,x1,x2
263         add     x0,x0,x2
264         add     x4,sp,x2
265         neg     x2,x2
266
267         add     x5,x5,x6,lsl#32 // pack
268         add     x7,x7,x8,lsl#32
269         add     x9,x9,x10,lsl#32
270         add     x11,x11,x12,lsl#32
271         add     x13,x13,x14,lsl#32
272         add     x15,x15,x16,lsl#32
273         add     x17,x17,x19,lsl#32
274         add     x20,x20,x21,lsl#32
275 #ifdef  __ARMEB__
276         rev     x5,x5
277         rev     x7,x7
278         rev     x9,x9
279         rev     x11,x11
280         rev     x13,x13
281         rev     x15,x15
282         rev     x17,x17
283         rev     x20,x20
284 #endif
// Spill the whole 64-byte key-stream block to the stack scratch area so it
// can be read back one byte at a time.
285         stp     x5,x7,[sp,#0]
286         stp     x9,x11,[sp,#16]
287         stp     x13,x15,[sp,#32]
288         stp     x17,x20,[sp,#48]
289
// out[i] = in[i] ^ keystream[i] for each remaining byte. The loop runs until
// x2 reaches zero, so it relies on x2 being non-zero on entry (1..63 on the
// paths visible in this file).
290 .Loop_tail:
291         ldrb    w10,[x1,x2]
292         ldrb    w11,[x4,x2]
293         add     x2,x2,#1
294         eor     w10,w10,w11
295         strb    w10,[x0,x2]
296         cbnz    x2,.Loop_tail
297
// Overwrite the spilled key stream with zeros before the scratch area is
// released.
298         stp     xzr,xzr,[sp,#0]
299         stp     xzr,xzr,[sp,#16]
300         stp     xzr,xzr,[sp,#32]
301         stp     xzr,xzr,[sp,#48]
302
// Epilogue, identical to the one after the full-block loop.
303         ldp     x19,x20,[x29,#16]
304         add     sp,sp,#64
305         ldp     x21,x22,[x29,#32]
306         ldp     x23,x24,[x29,#48]
307         ldp     x25,x26,[x29,#64]
308         ldp     x27,x28,[x29,#80]
309         ldp     x29,x30,[sp],#96
310 .inst   0xd50323bf                      // autiasp
311         ret
312 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
313
// ChaCha20_neon -- file-local (no .globl) NEON variant, branched to from
// ChaCha20_ctr32 with the same register arguments (x0 = out, x1 = in,
// x2 = length, x3 = key, x4 = counter block).
//
// Each outer iteration handles 256 bytes: one block in the scalar registers
// and three blocks in NEON registers. Lengths of 512 bytes or more are
// diverted to .L512_or_more_neon instead.
314 .type   ChaCha20_neon,%function
315 .align  5
316 ChaCha20_neon:
317 .inst   0xd503233f                      // paciasp
318         stp     x29,x30,[sp,#-96]!
319         add     x29,sp,#0
320
321         adr     x5,.Lsigma
322         stp     x19,x20,[sp,#16]
323         stp     x21,x22,[sp,#32]
324         stp     x23,x24,[sp,#48]
325         stp     x25,x26,[sp,#64]
326         stp     x27,x28,[sp,#80]
327         cmp     x2,#512
328         b.hs    .L512_or_more_neon
329
// Same frame shape as the scalar path: 96-byte register save area plus
// 64 bytes of scratch for a partial final block.
330         sub     sp,sp,#64
331
// The input state is loaded twice: packed into x22-x28/x30 for the scalar
// block and into v24 (sigma), v25/v26 (key) and v27 (counter block) for the
// vector blocks. The post-increment on the sigma load leaves x5 at .Lone,
// which is loaded into v31.
332         ldp     x22,x23,[x5]            // load sigma
333         ld1     {v24.4s},[x5],#16
334         ldp     x24,x25,[x3]            // load key
335         ldp     x26,x27,[x3,#16]
336         ld1     {v25.4s,v26.4s},[x3]
337         ldp     x28,x30,[x4]            // load counter
338         ld1     {v27.4s},[x4]
339         ld1     {v31.4s},[x5]
340 #ifdef  __ARMEB__
341         rev64   v24.4s,v24.4s
342         ror     x24,x24,#32
343         ror     x25,x25,#32
344         ror     x26,x26,#32
345         ror     x27,x27,#32
346         ror     x28,x28,#32
347         ror     x30,x30,#32
348 #endif
// The scalar block uses the counter as loaded (in x28). The vector blocks
// use counter+1 (v27), counter+2 (v28) and counter+3 (v29). v31 then becomes
// {4,0,0,0}, the per-iteration step for all three vectors.
349         add     v27.4s,v27.4s,v31.4s            // += 1
350         add     v28.4s,v27.4s,v31.4s
351         add     v29.4s,v28.4s,v31.4s
352         shl     v31.4s,v31.4s,#2                        // 1 -> 4
353
// Start of one 256-byte iteration: copy the input state into working
// registers for four blocks.
//   scalar block : w5-w17,w19-w21, same layout as in ChaCha20_ctr32
//   NEON block A : v0  = sigma, v1  = key[0..3], v2  = key[4..7], v3  = v27
//   NEON block B : v4  = sigma, v5  = key[0..3], v6  = key[4..7], v7  = v28
//   NEON block C : v16 = sigma, v17 = key[0..3], v18 = key[4..7], v19 = v29
// Scalar and vector moves are interleaved.
354 .Loop_outer_neon:
355         mov     w5,w22                  // unpack key block
356         lsr     x6,x22,#32
357         mov     v0.16b,v24.16b
358         mov     w7,w23
359         lsr     x8,x23,#32
360         mov     v4.16b,v24.16b
361         mov     w9,w24
362         lsr     x10,x24,#32
363         mov     v16.16b,v24.16b
364         mov     w11,w25
365         mov     v1.16b,v25.16b
366         lsr     x12,x25,#32
367         mov     v5.16b,v25.16b
368         mov     w13,w26
369         mov     v17.16b,v25.16b
370         lsr     x14,x26,#32
371         mov     v3.16b,v27.16b
372         mov     w15,w27
373         mov     v7.16b,v28.16b
374         lsr     x16,x27,#32
375         mov     v19.16b,v29.16b
376         mov     w17,w28
377         mov     v2.16b,v26.16b
378         lsr     x19,x28,#32
379         mov     v6.16b,v26.16b
380         mov     w20,w30
381         mov     v18.16b,v26.16b
382         lsr     x21,x30,#32
383
// 10 passes of .Loop_neon follow. The flags from this `subs` are consumed
// after the rounds by b.lo .Ltail_neon and b.hi .Loop_outer_neon; nothing in
// between writes NZCV.
384         mov     x4,#10
385         subs    x2,x2,#256
// One double round (column round, then diagonal round) applied to four
// blocks at once, with scalar and NEON instructions interleaved line by
// line. The statement order is deliberate scheduling; keep it as is.
//
// Vector rotations are built as follows:
//   rotl 16 : rev32 on 16-bit elements
//   rotl 12 : ushr #20 into the destination, then sli #12 from the temp
//   rotl  8 : ushr #24, then sli #8
//   rotl  7 : ushr #25, then sli #7
// v20-v22 are the temporaries holding the pre-rotation values.
//
// The scalar block reaches the diagonals by operand selection. The NEON
// blocks instead rotate rows b, c and d with `ext` between the two halves
// and rotate them back at the end of the pass.
386 .Loop_neon:
387         sub     x4,x4,#1
// ---- first half: column round ----
388         add     v0.4s,v0.4s,v1.4s
389         add     w5,w5,w9
390         add     v4.4s,v4.4s,v5.4s
391         add     w6,w6,w10
392         add     v16.4s,v16.4s,v17.4s
393         add     w7,w7,w11
394         eor     v3.16b,v3.16b,v0.16b
395         add     w8,w8,w12
396         eor     v7.16b,v7.16b,v4.16b
397         eor     w17,w17,w5
398         eor     v19.16b,v19.16b,v16.16b
399         eor     w19,w19,w6
400         rev32   v3.8h,v3.8h
401         eor     w20,w20,w7
402         rev32   v7.8h,v7.8h
403         eor     w21,w21,w8
404         rev32   v19.8h,v19.8h
405         ror     w17,w17,#16
406         add     v2.4s,v2.4s,v3.4s
407         ror     w19,w19,#16
408         add     v6.4s,v6.4s,v7.4s
409         ror     w20,w20,#16
410         add     v18.4s,v18.4s,v19.4s
411         ror     w21,w21,#16
412         eor     v20.16b,v1.16b,v2.16b
413         add     w13,w13,w17
414         eor     v21.16b,v5.16b,v6.16b
415         add     w14,w14,w19
416         eor     v22.16b,v17.16b,v18.16b
417         add     w15,w15,w20
418         ushr    v1.4s,v20.4s,#20
419         add     w16,w16,w21
420         ushr    v5.4s,v21.4s,#20
421         eor     w9,w9,w13
422         ushr    v17.4s,v22.4s,#20
423         eor     w10,w10,w14
424         sli     v1.4s,v20.4s,#12
425         eor     w11,w11,w15
426         sli     v5.4s,v21.4s,#12
427         eor     w12,w12,w16
428         sli     v17.4s,v22.4s,#12
429         ror     w9,w9,#20
430         add     v0.4s,v0.4s,v1.4s
431         ror     w10,w10,#20
432         add     v4.4s,v4.4s,v5.4s
433         ror     w11,w11,#20
434         add     v16.4s,v16.4s,v17.4s
435         ror     w12,w12,#20
436         eor     v20.16b,v3.16b,v0.16b
437         add     w5,w5,w9
438         eor     v21.16b,v7.16b,v4.16b
439         add     w6,w6,w10
440         eor     v22.16b,v19.16b,v16.16b
441         add     w7,w7,w11
442         ushr    v3.4s,v20.4s,#24
443         add     w8,w8,w12
444         ushr    v7.4s,v21.4s,#24
445         eor     w17,w17,w5
446         ushr    v19.4s,v22.4s,#24
447         eor     w19,w19,w6
448         sli     v3.4s,v20.4s,#8
449         eor     w20,w20,w7
450         sli     v7.4s,v21.4s,#8
451         eor     w21,w21,w8
452         sli     v19.4s,v22.4s,#8
453         ror     w17,w17,#24
454         add     v2.4s,v2.4s,v3.4s
455         ror     w19,w19,#24
456         add     v6.4s,v6.4s,v7.4s
457         ror     w20,w20,#24
458         add     v18.4s,v18.4s,v19.4s
459         ror     w21,w21,#24
460         eor     v20.16b,v1.16b,v2.16b
461         add     w13,w13,w17
462         eor     v21.16b,v5.16b,v6.16b
463         add     w14,w14,w19
464         eor     v22.16b,v17.16b,v18.16b
465         add     w15,w15,w20
466         ushr    v1.4s,v20.4s,#25
467         add     w16,w16,w21
468         ushr    v5.4s,v21.4s,#25
469         eor     w9,w9,w13
470         ushr    v17.4s,v22.4s,#25
471         eor     w10,w10,w14
472         sli     v1.4s,v20.4s,#7
473         eor     w11,w11,w15
474         sli     v5.4s,v21.4s,#7
475         eor     w12,w12,w16
476         sli     v17.4s,v22.4s,#7
477         ror     w9,w9,#25
// Rotate the vector rows so the diagonals line up as columns:
// row c by 8 bytes, row d by 12 bytes, row b by 4 bytes.
478         ext     v2.16b,v2.16b,v2.16b,#8
479         ror     w10,w10,#25
480         ext     v6.16b,v6.16b,v6.16b,#8
481         ror     w11,w11,#25
482         ext     v18.16b,v18.16b,v18.16b,#8
483         ror     w12,w12,#25
484         ext     v3.16b,v3.16b,v3.16b,#12
485         ext     v7.16b,v7.16b,v7.16b,#12
486         ext     v19.16b,v19.16b,v19.16b,#12
487         ext     v1.16b,v1.16b,v1.16b,#4
488         ext     v5.16b,v5.16b,v5.16b,#4
489         ext     v17.16b,v17.16b,v17.16b,#4
// ---- second half: diagonal round ----
490         add     v0.4s,v0.4s,v1.4s
491         add     w5,w5,w10
492         add     v4.4s,v4.4s,v5.4s
493         add     w6,w6,w11
494         add     v16.4s,v16.4s,v17.4s
495         add     w7,w7,w12
496         eor     v3.16b,v3.16b,v0.16b
497         add     w8,w8,w9
498         eor     v7.16b,v7.16b,v4.16b
499         eor     w21,w21,w5
500         eor     v19.16b,v19.16b,v16.16b
501         eor     w17,w17,w6
502         rev32   v3.8h,v3.8h
503         eor     w19,w19,w7
504         rev32   v7.8h,v7.8h
505         eor     w20,w20,w8
506         rev32   v19.8h,v19.8h
507         ror     w21,w21,#16
508         add     v2.4s,v2.4s,v3.4s
509         ror     w17,w17,#16
510         add     v6.4s,v6.4s,v7.4s
511         ror     w19,w19,#16
512         add     v18.4s,v18.4s,v19.4s
513         ror     w20,w20,#16
514         eor     v20.16b,v1.16b,v2.16b
515         add     w15,w15,w21
516         eor     v21.16b,v5.16b,v6.16b
517         add     w16,w16,w17
518         eor     v22.16b,v17.16b,v18.16b
519         add     w13,w13,w19
520         ushr    v1.4s,v20.4s,#20
521         add     w14,w14,w20
522         ushr    v5.4s,v21.4s,#20
523         eor     w10,w10,w15
524         ushr    v17.4s,v22.4s,#20
525         eor     w11,w11,w16
526         sli     v1.4s,v20.4s,#12
527         eor     w12,w12,w13
528         sli     v5.4s,v21.4s,#12
529         eor     w9,w9,w14
530         sli     v17.4s,v22.4s,#12
531         ror     w10,w10,#20
532         add     v0.4s,v0.4s,v1.4s
533         ror     w11,w11,#20
534         add     v4.4s,v4.4s,v5.4s
535         ror     w12,w12,#20
536         add     v16.4s,v16.4s,v17.4s
537         ror     w9,w9,#20
538         eor     v20.16b,v3.16b,v0.16b
539         add     w5,w5,w10
540         eor     v21.16b,v7.16b,v4.16b
541         add     w6,w6,w11
542         eor     v22.16b,v19.16b,v16.16b
543         add     w7,w7,w12
544         ushr    v3.4s,v20.4s,#24
545         add     w8,w8,w9
546         ushr    v7.4s,v21.4s,#24
547         eor     w21,w21,w5
548         ushr    v19.4s,v22.4s,#24
549         eor     w17,w17,w6
550         sli     v3.4s,v20.4s,#8
551         eor     w19,w19,w7
552         sli     v7.4s,v21.4s,#8
553         eor     w20,w20,w8
554         sli     v19.4s,v22.4s,#8
555         ror     w21,w21,#24
556         add     v2.4s,v2.4s,v3.4s
557         ror     w17,w17,#24
558         add     v6.4s,v6.4s,v7.4s
559         ror     w19,w19,#24
560         add     v18.4s,v18.4s,v19.4s
561         ror     w20,w20,#24
562         eor     v20.16b,v1.16b,v2.16b
563         add     w15,w15,w21
564         eor     v21.16b,v5.16b,v6.16b
565         add     w16,w16,w17
566         eor     v22.16b,v17.16b,v18.16b
567         add     w13,w13,w19
568         ushr    v1.4s,v20.4s,#25
569         add     w14,w14,w20
570         ushr    v5.4s,v21.4s,#25
571         eor     w10,w10,w15
572         ushr    v17.4s,v22.4s,#25
573         eor     w11,w11,w16
574         sli     v1.4s,v20.4s,#7
575         eor     w12,w12,w13
576         sli     v5.4s,v21.4s,#7
577         eor     w9,w9,w14
578         sli     v17.4s,v22.4s,#7
579         ror     w10,w10,#25
// Undo the row rotation (8+8, 12+4 and 4+12 bytes each sum to a full
// 16-byte turn), restoring the column layout for the next pass.
580         ext     v2.16b,v2.16b,v2.16b,#8
581         ror     w11,w11,#25
582         ext     v6.16b,v6.16b,v6.16b,#8
583         ror     w12,w12,#25
584         ext     v18.16b,v18.16b,v18.16b,#8
585         ror     w9,w9,#25
586         ext     v3.16b,v3.16b,v3.16b,#4
587         ext     v7.16b,v7.16b,v7.16b,#4
588         ext     v19.16b,v19.16b,v19.16b,#4
589         ext     v1.16b,v1.16b,v1.16b,#12
590         ext     v5.16b,v5.16b,v5.16b,#12
591         ext     v17.16b,v17.16b,v17.16b,#12
592         cbnz    x4,.Loop_neon
593
// After the rounds: add the input state back into all four blocks.
// Each vector block gets sigma (v24), key (v25, v26) and its own counter
// vector (v27, v28 or v29). The scalar block is handled as in ChaCha20_ctr32.
594         add     w5,w5,w22               // accumulate key block
595         add     v0.4s,v0.4s,v24.4s
596         add     x6,x6,x22,lsr#32
597         add     v4.4s,v4.4s,v24.4s
598         add     w7,w7,w23
599         add     v16.4s,v16.4s,v24.4s
600         add     x8,x8,x23,lsr#32
601         add     v2.4s,v2.4s,v26.4s
602         add     w9,w9,w24
603         add     v6.4s,v6.4s,v26.4s
604         add     x10,x10,x24,lsr#32
605         add     v18.4s,v18.4s,v26.4s
606         add     w11,w11,w25
607         add     v3.4s,v3.4s,v27.4s
608         add     x12,x12,x25,lsr#32
609         add     w13,w13,w26
610         add     v7.4s,v7.4s,v28.4s
611         add     x14,x14,x26,lsr#32
612         add     w15,w15,w27
613         add     v19.4s,v19.4s,v29.4s
614         add     x16,x16,x27,lsr#32
615         add     w17,w17,w28
616         add     v1.4s,v1.4s,v25.4s
617         add     x19,x19,x28,lsr#32
618         add     w20,w20,w30
619         add     v5.4s,v5.4s,v25.4s
620         add     x21,x21,x30,lsr#32
621         add     v17.4s,v17.4s,v25.4s
622
// Flags are still those of `subs x2,x2,#256`: "lo" means fewer than 256
// bytes were left, so the four blocks are consumed piecewise at .Ltail_neon.
623         b.lo    .Ltail_neon
624
// Full 256 bytes. The first 64 bytes go through the scalar registers.
625         add     x5,x5,x6,lsl#32 // pack
626         add     x7,x7,x8,lsl#32
627         ldp     x6,x8,[x1,#0]           // load input
628         add     x9,x9,x10,lsl#32
629         add     x11,x11,x12,lsl#32
630         ldp     x10,x12,[x1,#16]
631         add     x13,x13,x14,lsl#32
632         add     x15,x15,x16,lsl#32
633         ldp     x14,x16,[x1,#32]
634         add     x17,x17,x19,lsl#32
635         add     x20,x20,x21,lsl#32
636         ldp     x19,x21,[x1,#48]
637         add     x1,x1,#64
638 #ifdef  __ARMEB__
639         rev     x5,x5
640         rev     x7,x7
641         rev     x9,x9
642         rev     x11,x11
643         rev     x13,x13
644         rev     x15,x15
645         rev     x17,x17
646         rev     x20,x20
647 #endif
// Second 64 input bytes -> v20-v23, XORed with NEON block A (v0-v3).
648         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
649         eor     x5,x5,x6
650         eor     x7,x7,x8
651         eor     x9,x9,x10
652         eor     x11,x11,x12
653         eor     x13,x13,x14
654         eor     v0.16b,v0.16b,v20.16b
655         eor     x15,x15,x16
656         eor     v1.16b,v1.16b,v21.16b
657         eor     x17,x17,x19
658         eor     v2.16b,v2.16b,v22.16b
659         eor     x20,x20,x21
660         eor     v3.16b,v3.16b,v23.16b
// Third 64 input bytes -> v20-v23 (reused), for NEON block B below.
661         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
662
// Store the scalar block and step every counter copy by 4 for the next
// iteration. The scalar step is a 64-bit add on packed x28; the vector
// steps are per-32-bit-lane adds of v31 = {4,0,0,0}.
663         stp     x5,x7,[x0,#0]           // store output
664         add     x28,x28,#4                      // increment counter
665         stp     x9,x11,[x0,#16]
666         add     v27.4s,v27.4s,v31.4s            // += 4
667         stp     x13,x15,[x0,#32]
668         add     v28.4s,v28.4s,v31.4s
669         stp     x17,x20,[x0,#48]
670         add     v29.4s,v29.4s,v31.4s
671         add     x0,x0,#64
672
// Store block A, then reuse v0-v3 to hold the fourth 64 input bytes.
673         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
674         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
675
676         eor     v4.16b,v4.16b,v20.16b
677         eor     v5.16b,v5.16b,v21.16b
678         eor     v6.16b,v6.16b,v22.16b
679         eor     v7.16b,v7.16b,v23.16b
680         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
681
682         eor     v16.16b,v16.16b,v0.16b
683         eor     v17.16b,v17.16b,v1.16b
684         eor     v18.16b,v18.16b,v2.16b
685         eor     v19.16b,v19.16b,v3.16b
686         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
687
// "hi" (same `subs` flags) means more than 256 bytes were left before this
// iteration, so input remains.
688         b.hi    .Loop_outer_neon
689
// Epilogue: restore callee-saved registers and release scratch and frame.
690         ldp     x19,x20,[x29,#16]
691         add     sp,sp,#64
692         ldp     x21,x22,[x29,#32]
693         ldp     x23,x24,[x29,#48]
694         ldp     x25,x26,[x29,#64]
695         ldp     x27,x28,[x29,#80]
696         ldp     x29,x30,[sp],#96
697 .inst   0xd50323bf                      // autiasp
698         ret
699
// Final partial iteration of the NEON path (fewer than 256 bytes left).
// Four key-stream blocks have been computed: scalar, v0-v3, v4-v7, v16-v19.
// They are consumed in that order, 64 bytes at a time. The first block that
// is only partly needed is spilled to the stack scratch area for the
// byte-wise loop at .Last_neon.
// Each `b.eq .Ldone_neon` reuses the flags of the preceding `cmp x2,#64`
// (the instructions in between do not write NZCV) and exits when the
// remaining length was exactly 64.
700 .Ltail_neon:
701         add     x2,x2,#256
702         cmp     x2,#64
// Fewer than 64 bytes: only the scalar block is needed, so reuse the tail
// code inside ChaCha20_ctr32. Its frame layout is the same as this one.
703         b.lo    .Less_than_64
704
705         add     x5,x5,x6,lsl#32 // pack
706         add     x7,x7,x8,lsl#32
707         ldp     x6,x8,[x1,#0]           // load input
708         add     x9,x9,x10,lsl#32
709         add     x11,x11,x12,lsl#32
710         ldp     x10,x12,[x1,#16]
711         add     x13,x13,x14,lsl#32
712         add     x15,x15,x16,lsl#32
713         ldp     x14,x16,[x1,#32]
714         add     x17,x17,x19,lsl#32
715         add     x20,x20,x21,lsl#32
716         ldp     x19,x21,[x1,#48]
717         add     x1,x1,#64
718 #ifdef  __ARMEB__
719         rev     x5,x5
720         rev     x7,x7
721         rev     x9,x9
722         rev     x11,x11
723         rev     x13,x13
724         rev     x15,x15
725         rev     x17,x17
726         rev     x20,x20
727 #endif
728         eor     x5,x5,x6
729         eor     x7,x7,x8
730         eor     x9,x9,x10
731         eor     x11,x11,x12
732         eor     x13,x13,x14
733         eor     x15,x15,x16
734         eor     x17,x17,x19
735         eor     x20,x20,x21
736
737         stp     x5,x7,[x0,#0]           // store output
738         add     x28,x28,#4                      // increment counter
739         stp     x9,x11,[x0,#16]
740         stp     x13,x15,[x0,#32]
741         stp     x17,x20,[x0,#48]
742         add     x0,x0,#64
743         b.eq    .Ldone_neon
744         sub     x2,x2,#64
745         cmp     x2,#64
746         b.lo    .Less_than_128
747
// Second full 64 bytes: XOR with NEON block A (v0-v3).
748         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
749         eor     v0.16b,v0.16b,v20.16b
750         eor     v1.16b,v1.16b,v21.16b
751         eor     v2.16b,v2.16b,v22.16b
752         eor     v3.16b,v3.16b,v23.16b
753         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
754         b.eq    .Ldone_neon
755         sub     x2,x2,#64
756         cmp     x2,#64
757         b.lo    .Less_than_192
758
// Third full 64 bytes: XOR with NEON block B (v4-v7).
759         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
760         eor     v4.16b,v4.16b,v20.16b
761         eor     v5.16b,v5.16b,v21.16b
762         eor     v6.16b,v6.16b,v22.16b
763         eor     v7.16b,v7.16b,v23.16b
764         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
765         b.eq    .Ldone_neon
766         sub     x2,x2,#64
767
// What is left is shorter than 64 bytes (the total was below 256):
// spill NEON block C as the key stream for the byte-wise loop.
768         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
769         b       .Last_neon
770
// Remainder falls inside the second block: spill NEON block A.
771 .Less_than_128:
772         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
773         b       .Last_neon
// Remainder falls inside the third block: spill NEON block B.
774 .Less_than_192:
775         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
776         b       .Last_neon
777
// Byte-wise finish of the NEON path. On entry x2 is the number of bytes
// still to process and the 64-byte key-stream block to use has been stored
// at [sp].
778 .align  4
779 .Last_neon:
// Point x1, x0 and x4 just past the end of input, output and the spilled
// key stream, then index them with a negative x2 counting up to zero.
// x0 is biased by -1 because the loop increments x2 before the store.
780         sub     x0,x0,#1
781         add     x1,x1,x2
782         add     x0,x0,x2
783         add     x4,sp,x2
784         neg     x2,x2
785
// out[i] = in[i] ^ keystream[i]; relies on x2 being non-zero on entry.
786 .Loop_tail_neon:
787         ldrb    w10,[x1,x2]
788         ldrb    w11,[x4,x2]
789         add     x2,x2,#1
790         eor     w10,w10,w11
791         strb    w10,[x0,x2]
792         cbnz    x2,.Loop_tail_neon
793
// Overwrite the spilled key stream with zeros before releasing the scratch.
794         stp     xzr,xzr,[sp,#0]
795         stp     xzr,xzr,[sp,#16]
796         stp     xzr,xzr,[sp,#32]
797         stp     xzr,xzr,[sp,#48]
798
// Common exit. Paths that branch straight here from .Ltail_neon never wrote
// the scratch area, so they skip the wipe above.
799 .Ldone_neon:
800         ldp     x19,x20,[x29,#16]
801         add     sp,sp,#64
802         ldp     x21,x22,[x29,#32]
803         ldp     x23,x24,[x29,#48]
804         ldp     x25,x26,[x29,#64]
805         ldp     x27,x28,[x29,#80]
806         ldp     x29,x30,[sp],#96
807 .inst   0xd50323bf                      // autiasp
808         ret
809 .size   ChaCha20_neon,.-ChaCha20_neon
810 .type   ChaCha20_512_neon,%function
811 .align  5
812 ChaCha20_512_neon:
813 .inst   0xd503233f                      // paciasp
814         stp     x29,x30,[sp,#-96]!
815         add     x29,sp,#0
816
817         adr     x5,.Lsigma
818         stp     x19,x20,[sp,#16]
819         stp     x21,x22,[sp,#32]
820         stp     x23,x24,[sp,#48]
821         stp     x25,x26,[sp,#64]
822         stp     x27,x28,[sp,#80]
823
824 .L512_or_more_neon:
825         sub     sp,sp,#128+64
826
827         ldp     x22,x23,[x5]            // load sigma
828         ld1     {v24.4s},[x5],#16
829         ldp     x24,x25,[x3]            // load key
830         ldp     x26,x27,[x3,#16]
831         ld1     {v25.4s,v26.4s},[x3]
832         ldp     x28,x30,[x4]            // load counter
833         ld1     {v27.4s},[x4]
834         ld1     {v31.4s},[x5]
835 #ifdef  __ARMEB__
836         rev64   v24.4s,v24.4s
837         ror     x24,x24,#32
838         ror     x25,x25,#32
839         ror     x26,x26,#32
840         ror     x27,x27,#32
841         ror     x28,x28,#32
842         ror     x30,x30,#32
843 #endif
844         add     v27.4s,v27.4s,v31.4s            // += 1
845         stp     q24,q25,[sp,#0]         // off-load key block, invariant part
846         add     v27.4s,v27.4s,v31.4s            // not typo
847         str     q26,[sp,#32]
848         add     v28.4s,v27.4s,v31.4s
849         add     v29.4s,v28.4s,v31.4s
850         add     v30.4s,v29.4s,v31.4s
851         shl     v31.4s,v31.4s,#2                        // 1 -> 4
852
853         stp     d8,d9,[sp,#128+0]               // meet ABI requirements
854         stp     d10,d11,[sp,#128+16]
855         stp     d12,d13,[sp,#128+32]
856         stp     d14,d15,[sp,#128+48]
857
858         sub     x2,x2,#512                      // not typo
859
860 .Loop_outer_512_neon:
861         mov     v0.16b,v24.16b
862         mov     v4.16b,v24.16b
863         mov     v8.16b,v24.16b
864         mov     v12.16b,v24.16b
865         mov     v16.16b,v24.16b
866         mov     v20.16b,v24.16b
867         mov     v1.16b,v25.16b
868         mov     w5,w22                  // unpack key block
869         mov     v5.16b,v25.16b
870         lsr     x6,x22,#32
871         mov     v9.16b,v25.16b
872         mov     w7,w23
873         mov     v13.16b,v25.16b
874         lsr     x8,x23,#32
875         mov     v17.16b,v25.16b
876         mov     w9,w24
877         mov     v21.16b,v25.16b
878         lsr     x10,x24,#32
879         mov     v3.16b,v27.16b
880         mov     w11,w25
881         mov     v7.16b,v28.16b
882         lsr     x12,x25,#32
883         mov     v11.16b,v29.16b
884         mov     w13,w26
885         mov     v15.16b,v30.16b
886         lsr     x14,x26,#32
887         mov     v2.16b,v26.16b
888         mov     w15,w27
889         mov     v6.16b,v26.16b
890         lsr     x16,x27,#32
891         add     v19.4s,v3.4s,v31.4s                     // +4
892         mov     w17,w28
893         add     v23.4s,v7.4s,v31.4s                     // +4
894         lsr     x19,x28,#32
895         mov     v10.16b,v26.16b
896         mov     w20,w30
897         mov     v14.16b,v26.16b
898         lsr     x21,x30,#32
899         mov     v18.16b,v26.16b
900         stp     q27,q28,[sp,#48]                // off-load key block, variable part
901         mov     v22.16b,v26.16b
902         str     q29,[sp,#80]
903
904         mov     x4,#5
905         subs    x2,x2,#512
// Upper half of the interleaved round loop.
// Entered with x4 = 5 (set just above the label); x4 is decremented at the
// top and tested by the cbnz at the bottom, so the body runs five times.
// Two independent instruction streams alternate line by line:
//   scalar: one state held as rows a = w5-w8, b = w9-w12, c = w13-w16,
//           d = w17,w19,w20,w21.  Each pass does two full double rounds
//           (column round + diagonal round, twice), so five passes give
//           this state all 20 rounds.
//   NEON:   six states of four vectors each (v0-v3, v4-v7, v8-v11,
//           v12-v15, v16-v19, v20-v23; rows a,b,c,d in that order), with
//           v24-v29 as rotate temporaries.  Each pass does one double
//           round: a quarter round on columns, ext lane rotations so the
//           diagonals line up as columns, a second quarter round, then
//           the inverse ext rotations.
// Rotates: scalar "ror #16/#20/#24/#25" are left-rotates by 16/12/8/7;
// NEON uses rev32 on .8h for 16 and ushr+sli pairs for 12/8/7.
// No instruction in the body writes the condition flags.
906 .Loop_upper_neon:
907         sub     x4,x4,#1                        // one pass consumed; tested by cbnz below
908         add     v0.4s,v0.4s,v1.4s               // NEON quarter round on columns: a += b
909         add     w5,w5,w9                        // scalar column round: a += b
910         add     v4.4s,v4.4s,v5.4s
911         add     w6,w6,w10
912         add     v8.4s,v8.4s,v9.4s
913         add     w7,w7,w11
914         add     v12.4s,v12.4s,v13.4s
915         add     w8,w8,w12
916         add     v16.4s,v16.4s,v17.4s
917         eor     w17,w17,w5                      // scalar: d ^= a
918         add     v20.4s,v20.4s,v21.4s
919         eor     w19,w19,w6
920         eor     v3.16b,v3.16b,v0.16b            // NEON: d ^= a
921         eor     w20,w20,w7
922         eor     v7.16b,v7.16b,v4.16b
923         eor     w21,w21,w8
924         eor     v11.16b,v11.16b,v8.16b
925         ror     w17,w17,#16                     // scalar: d <<<= 16
926         eor     v15.16b,v15.16b,v12.16b
927         ror     w19,w19,#16
928         eor     v19.16b,v19.16b,v16.16b
929         ror     w20,w20,#16
930         eor     v23.16b,v23.16b,v20.16b
931         ror     w21,w21,#16
932         rev32   v3.8h,v3.8h                     // NEON: d <<<= 16 (swap 16-bit halves of each word)
933         add     w13,w13,w17                     // scalar: c += d
934         rev32   v7.8h,v7.8h
935         add     w14,w14,w19
936         rev32   v11.8h,v11.8h
937         add     w15,w15,w20
938         rev32   v15.8h,v15.8h
939         add     w16,w16,w21
940         rev32   v19.8h,v19.8h
941         eor     w9,w9,w13                       // scalar: b ^= c
942         rev32   v23.8h,v23.8h
943         eor     w10,w10,w14
944         add     v2.4s,v2.4s,v3.4s               // NEON: c += d
945         eor     w11,w11,w15
946         add     v6.4s,v6.4s,v7.4s
947         eor     w12,w12,w16
948         add     v10.4s,v10.4s,v11.4s
949         ror     w9,w9,#20                       // scalar: b <<<= 12
950         add     v14.4s,v14.4s,v15.4s
951         ror     w10,w10,#20
952         add     v18.4s,v18.4s,v19.4s
953         ror     w11,w11,#20
954         add     v22.4s,v22.4s,v23.4s
955         ror     w12,w12,#20
956         eor     v24.16b,v1.16b,v2.16b           // NEON: t = b ^ c (into temporaries v24-v29)
957         add     w5,w5,w9                        // scalar: a += b
958         eor     v25.16b,v5.16b,v6.16b
959         add     w6,w6,w10
960         eor     v26.16b,v9.16b,v10.16b
961         add     w7,w7,w11
962         eor     v27.16b,v13.16b,v14.16b
963         add     w8,w8,w12
964         eor     v28.16b,v17.16b,v18.16b
965         eor     w17,w17,w5                      // scalar: d ^= a
966         eor     v29.16b,v21.16b,v22.16b
967         eor     w19,w19,w6
968         ushr    v1.4s,v24.4s,#20                // NEON: b = t <<< 12, first half (t >> 20)
969         eor     w20,w20,w7
970         ushr    v5.4s,v25.4s,#20
971         eor     w21,w21,w8
972         ushr    v9.4s,v26.4s,#20
973         ror     w17,w17,#24                     // scalar: d <<<= 8
974         ushr    v13.4s,v27.4s,#20
975         ror     w19,w19,#24
976         ushr    v17.4s,v28.4s,#20
977         ror     w20,w20,#24
978         ushr    v21.4s,v29.4s,#20
979         ror     w21,w21,#24
980         sli     v1.4s,v24.4s,#12                // NEON: b = t <<< 12, second half (insert t << 12)
981         add     w13,w13,w17                     // scalar: c += d
982         sli     v5.4s,v25.4s,#12
983         add     w14,w14,w19
984         sli     v9.4s,v26.4s,#12
985         add     w15,w15,w20
986         sli     v13.4s,v27.4s,#12
987         add     w16,w16,w21
988         sli     v17.4s,v28.4s,#12
989         eor     w9,w9,w13                       // scalar: b ^= c
990         sli     v21.4s,v29.4s,#12
991         eor     w10,w10,w14
992         add     v0.4s,v0.4s,v1.4s               // NEON: a += b
993         eor     w11,w11,w15
994         add     v4.4s,v4.4s,v5.4s
995         eor     w12,w12,w16
996         add     v8.4s,v8.4s,v9.4s
997         ror     w9,w9,#25                       // scalar: b <<<= 7 (column round done)
998         add     v12.4s,v12.4s,v13.4s
999         ror     w10,w10,#25
1000         add     v16.4s,v16.4s,v17.4s
1001         ror     w11,w11,#25
1002         add     v20.4s,v20.4s,v21.4s
1003         ror     w12,w12,#25
1004         eor     v24.16b,v3.16b,v0.16b          // NEON: t = d ^ a
1005         add     w5,w5,w10                      // scalar diagonal round: a += b, b index shifted by one
1006         eor     v25.16b,v7.16b,v4.16b
1007         add     w6,w6,w11
1008         eor     v26.16b,v11.16b,v8.16b
1009         add     w7,w7,w12
1010         eor     v27.16b,v15.16b,v12.16b
1011         add     w8,w8,w9
1012         eor     v28.16b,v19.16b,v16.16b
1013         eor     w21,w21,w5
1014         eor     v29.16b,v23.16b,v20.16b
1015         eor     w17,w17,w6
1016         ushr    v3.4s,v24.4s,#24               // NEON: d = t <<< 8 (ushr #24 + sli #8)
1017         eor     w19,w19,w7
1018         ushr    v7.4s,v25.4s,#24
1019         eor     w20,w20,w8
1020         ushr    v11.4s,v26.4s,#24
1021         ror     w21,w21,#16
1022         ushr    v15.4s,v27.4s,#24
1023         ror     w17,w17,#16
1024         ushr    v19.4s,v28.4s,#24
1025         ror     w19,w19,#16
1026         ushr    v23.4s,v29.4s,#24
1027         ror     w20,w20,#16
1028         sli     v3.4s,v24.4s,#8
1029         add     w15,w15,w21
1030         sli     v7.4s,v25.4s,#8
1031         add     w16,w16,w17
1032         sli     v11.4s,v26.4s,#8
1033         add     w13,w13,w19
1034         sli     v15.4s,v27.4s,#8
1035         add     w14,w14,w20
1036         sli     v19.4s,v28.4s,#8
1037         eor     w10,w10,w15
1038         sli     v23.4s,v29.4s,#8
1039         eor     w11,w11,w16
1040         add     v2.4s,v2.4s,v3.4s              // NEON: c += d
1041         eor     w12,w12,w13
1042         add     v6.4s,v6.4s,v7.4s
1043         eor     w9,w9,w14
1044         add     v10.4s,v10.4s,v11.4s
1045         ror     w10,w10,#20
1046         add     v14.4s,v14.4s,v15.4s
1047         ror     w11,w11,#20
1048         add     v18.4s,v18.4s,v19.4s
1049         ror     w12,w12,#20
1050         add     v22.4s,v22.4s,v23.4s
1051         ror     w9,w9,#20
1052         eor     v24.16b,v1.16b,v2.16b          // NEON: t = b ^ c
1053         add     w5,w5,w10
1054         eor     v25.16b,v5.16b,v6.16b
1055         add     w6,w6,w11
1056         eor     v26.16b,v9.16b,v10.16b
1057         add     w7,w7,w12
1058         eor     v27.16b,v13.16b,v14.16b
1059         add     w8,w8,w9
1060         eor     v28.16b,v17.16b,v18.16b
1061         eor     w21,w21,w5
1062         eor     v29.16b,v21.16b,v22.16b
1063         eor     w17,w17,w6
1064         ushr    v1.4s,v24.4s,#25               // NEON: b = t <<< 7 (ushr #25 + sli #7)
1065         eor     w19,w19,w7
1066         ushr    v5.4s,v25.4s,#25
1067         eor     w20,w20,w8
1068         ushr    v9.4s,v26.4s,#25
1069         ror     w21,w21,#24
1070         ushr    v13.4s,v27.4s,#25
1071         ror     w17,w17,#24
1072         ushr    v17.4s,v28.4s,#25
1073         ror     w19,w19,#24
1074         ushr    v21.4s,v29.4s,#25
1075         ror     w20,w20,#24
1076         sli     v1.4s,v24.4s,#7
1077         add     w15,w15,w21
1078         sli     v5.4s,v25.4s,#7
1079         add     w16,w16,w17
1080         sli     v9.4s,v26.4s,#7
1081         add     w13,w13,w19
1082         sli     v13.4s,v27.4s,#7
1083         add     w14,w14,w20
1084         sli     v17.4s,v28.4s,#7
1085         eor     w10,w10,w15
1086         sli     v21.4s,v29.4s,#7
1087         eor     w11,w11,w16
1088         ext     v2.16b,v2.16b,v2.16b,#8        // NEON: rotate row c by 8 bytes (two lanes)
1089         eor     w12,w12,w13
1090         ext     v6.16b,v6.16b,v6.16b,#8
1091         eor     w9,w9,w14
1092         ext     v10.16b,v10.16b,v10.16b,#8
1093         ror     w10,w10,#25
1094         ext     v14.16b,v14.16b,v14.16b,#8
1095         ror     w11,w11,#25
1096         ext     v18.16b,v18.16b,v18.16b,#8
1097         ror     w12,w12,#25
1098         ext     v22.16b,v22.16b,v22.16b,#8
1099         ror     w9,w9,#25                      // scalar: first double round done
1100         ext     v3.16b,v3.16b,v3.16b,#12       // NEON: rotate row d by 12 bytes
1101         ext     v7.16b,v7.16b,v7.16b,#12
1102         ext     v11.16b,v11.16b,v11.16b,#12
1103         ext     v15.16b,v15.16b,v15.16b,#12
1104         ext     v19.16b,v19.16b,v19.16b,#12
1105         ext     v23.16b,v23.16b,v23.16b,#12
1106         ext     v1.16b,v1.16b,v1.16b,#4        // NEON: rotate row b by 4 bytes; diagonals now sit in columns
1107         ext     v5.16b,v5.16b,v5.16b,#4
1108         ext     v9.16b,v9.16b,v9.16b,#4
1109         ext     v13.16b,v13.16b,v13.16b,#4
1110         ext     v17.16b,v17.16b,v17.16b,#4
1111         ext     v21.16b,v21.16b,v21.16b,#4
1112         add     v0.4s,v0.4s,v1.4s              // NEON quarter round on diagonals: a += b
1113         add     w5,w5,w9                       // scalar second double round, column round: a += b
1114         add     v4.4s,v4.4s,v5.4s
1115         add     w6,w6,w10
1116         add     v8.4s,v8.4s,v9.4s
1117         add     w7,w7,w11
1118         add     v12.4s,v12.4s,v13.4s
1119         add     w8,w8,w12
1120         add     v16.4s,v16.4s,v17.4s
1121         eor     w17,w17,w5
1122         add     v20.4s,v20.4s,v21.4s
1123         eor     w19,w19,w6
1124         eor     v3.16b,v3.16b,v0.16b
1125         eor     w20,w20,w7
1126         eor     v7.16b,v7.16b,v4.16b
1127         eor     w21,w21,w8
1128         eor     v11.16b,v11.16b,v8.16b
1129         ror     w17,w17,#16
1130         eor     v15.16b,v15.16b,v12.16b
1131         ror     w19,w19,#16
1132         eor     v19.16b,v19.16b,v16.16b
1133         ror     w20,w20,#16
1134         eor     v23.16b,v23.16b,v20.16b
1135         ror     w21,w21,#16
1136         rev32   v3.8h,v3.8h
1137         add     w13,w13,w17
1138         rev32   v7.8h,v7.8h
1139         add     w14,w14,w19
1140         rev32   v11.8h,v11.8h
1141         add     w15,w15,w20
1142         rev32   v15.8h,v15.8h
1143         add     w16,w16,w21
1144         rev32   v19.8h,v19.8h
1145         eor     w9,w9,w13
1146         rev32   v23.8h,v23.8h
1147         eor     w10,w10,w14
1148         add     v2.4s,v2.4s,v3.4s
1149         eor     w11,w11,w15
1150         add     v6.4s,v6.4s,v7.4s
1151         eor     w12,w12,w16
1152         add     v10.4s,v10.4s,v11.4s
1153         ror     w9,w9,#20
1154         add     v14.4s,v14.4s,v15.4s
1155         ror     w10,w10,#20
1156         add     v18.4s,v18.4s,v19.4s
1157         ror     w11,w11,#20
1158         add     v22.4s,v22.4s,v23.4s
1159         ror     w12,w12,#20
1160         eor     v24.16b,v1.16b,v2.16b
1161         add     w5,w5,w9
1162         eor     v25.16b,v5.16b,v6.16b
1163         add     w6,w6,w10
1164         eor     v26.16b,v9.16b,v10.16b
1165         add     w7,w7,w11
1166         eor     v27.16b,v13.16b,v14.16b
1167         add     w8,w8,w12
1168         eor     v28.16b,v17.16b,v18.16b
1169         eor     w17,w17,w5
1170         eor     v29.16b,v21.16b,v22.16b
1171         eor     w19,w19,w6
1172         ushr    v1.4s,v24.4s,#20
1173         eor     w20,w20,w7
1174         ushr    v5.4s,v25.4s,#20
1175         eor     w21,w21,w8
1176         ushr    v9.4s,v26.4s,#20
1177         ror     w17,w17,#24
1178         ushr    v13.4s,v27.4s,#20
1179         ror     w19,w19,#24
1180         ushr    v17.4s,v28.4s,#20
1181         ror     w20,w20,#24
1182         ushr    v21.4s,v29.4s,#20
1183         ror     w21,w21,#24
1184         sli     v1.4s,v24.4s,#12
1185         add     w13,w13,w17
1186         sli     v5.4s,v25.4s,#12
1187         add     w14,w14,w19
1188         sli     v9.4s,v26.4s,#12
1189         add     w15,w15,w20
1190         sli     v13.4s,v27.4s,#12
1191         add     w16,w16,w21
1192         sli     v17.4s,v28.4s,#12
1193         eor     w9,w9,w13
1194         sli     v21.4s,v29.4s,#12
1195         eor     w10,w10,w14
1196         add     v0.4s,v0.4s,v1.4s
1197         eor     w11,w11,w15
1198         add     v4.4s,v4.4s,v5.4s
1199         eor     w12,w12,w16
1200         add     v8.4s,v8.4s,v9.4s
1201         ror     w9,w9,#25
1202         add     v12.4s,v12.4s,v13.4s
1203         ror     w10,w10,#25
1204         add     v16.4s,v16.4s,v17.4s
1205         ror     w11,w11,#25
1206         add     v20.4s,v20.4s,v21.4s
1207         ror     w12,w12,#25
1208         eor     v24.16b,v3.16b,v0.16b
1209         add     w5,w5,w10                      // scalar second double round, diagonal round
1210         eor     v25.16b,v7.16b,v4.16b
1211         add     w6,w6,w11
1212         eor     v26.16b,v11.16b,v8.16b
1213         add     w7,w7,w12
1214         eor     v27.16b,v15.16b,v12.16b
1215         add     w8,w8,w9
1216         eor     v28.16b,v19.16b,v16.16b
1217         eor     w21,w21,w5
1218         eor     v29.16b,v23.16b,v20.16b
1219         eor     w17,w17,w6
1220         ushr    v3.4s,v24.4s,#24
1221         eor     w19,w19,w7
1222         ushr    v7.4s,v25.4s,#24
1223         eor     w20,w20,w8
1224         ushr    v11.4s,v26.4s,#24
1225         ror     w21,w21,#16
1226         ushr    v15.4s,v27.4s,#24
1227         ror     w17,w17,#16
1228         ushr    v19.4s,v28.4s,#24
1229         ror     w19,w19,#16
1230         ushr    v23.4s,v29.4s,#24
1231         ror     w20,w20,#16
1232         sli     v3.4s,v24.4s,#8
1233         add     w15,w15,w21
1234         sli     v7.4s,v25.4s,#8
1235         add     w16,w16,w17
1236         sli     v11.4s,v26.4s,#8
1237         add     w13,w13,w19
1238         sli     v15.4s,v27.4s,#8
1239         add     w14,w14,w20
1240         sli     v19.4s,v28.4s,#8
1241         eor     w10,w10,w15
1242         sli     v23.4s,v29.4s,#8
1243         eor     w11,w11,w16
1244         add     v2.4s,v2.4s,v3.4s
1245         eor     w12,w12,w13
1246         add     v6.4s,v6.4s,v7.4s
1247         eor     w9,w9,w14
1248         add     v10.4s,v10.4s,v11.4s
1249         ror     w10,w10,#20
1250         add     v14.4s,v14.4s,v15.4s
1251         ror     w11,w11,#20
1252         add     v18.4s,v18.4s,v19.4s
1253         ror     w12,w12,#20
1254         add     v22.4s,v22.4s,v23.4s
1255         ror     w9,w9,#20
1256         eor     v24.16b,v1.16b,v2.16b
1257         add     w5,w5,w10
1258         eor     v25.16b,v5.16b,v6.16b
1259         add     w6,w6,w11
1260         eor     v26.16b,v9.16b,v10.16b
1261         add     w7,w7,w12
1262         eor     v27.16b,v13.16b,v14.16b
1263         add     w8,w8,w9
1264         eor     v28.16b,v17.16b,v18.16b
1265         eor     w21,w21,w5
1266         eor     v29.16b,v21.16b,v22.16b
1267         eor     w17,w17,w6
1268         ushr    v1.4s,v24.4s,#25
1269         eor     w19,w19,w7
1270         ushr    v5.4s,v25.4s,#25
1271         eor     w20,w20,w8
1272         ushr    v9.4s,v26.4s,#25
1273         ror     w21,w21,#24
1274         ushr    v13.4s,v27.4s,#25
1275         ror     w17,w17,#24
1276         ushr    v17.4s,v28.4s,#25
1277         ror     w19,w19,#24
1278         ushr    v21.4s,v29.4s,#25
1279         ror     w20,w20,#24
1280         sli     v1.4s,v24.4s,#7
1281         add     w15,w15,w21
1282         sli     v5.4s,v25.4s,#7
1283         add     w16,w16,w17
1284         sli     v9.4s,v26.4s,#7
1285         add     w13,w13,w19
1286         sli     v13.4s,v27.4s,#7
1287         add     w14,w14,w20
1288         sli     v17.4s,v28.4s,#7
1289         eor     w10,w10,w15
1290         sli     v21.4s,v29.4s,#7
1291         eor     w11,w11,w16
1292         ext     v2.16b,v2.16b,v2.16b,#8        // NEON: rotate row c by 8 bytes again (back to original order)
1293         eor     w12,w12,w13
1294         ext     v6.16b,v6.16b,v6.16b,#8
1295         eor     w9,w9,w14
1296         ext     v10.16b,v10.16b,v10.16b,#8
1297         ror     w10,w10,#25
1298         ext     v14.16b,v14.16b,v14.16b,#8
1299         ror     w11,w11,#25
1300         ext     v18.16b,v18.16b,v18.16b,#8
1301         ror     w12,w12,#25
1302         ext     v22.16b,v22.16b,v22.16b,#8
1303         ror     w9,w9,#25                      // scalar: second double round done
1304         ext     v3.16b,v3.16b,v3.16b,#4        // NEON: undo the earlier #12 rotation of row d
1305         ext     v7.16b,v7.16b,v7.16b,#4
1306         ext     v11.16b,v11.16b,v11.16b,#4
1307         ext     v15.16b,v15.16b,v15.16b,#4
1308         ext     v19.16b,v19.16b,v19.16b,#4
1309         ext     v23.16b,v23.16b,v23.16b,#4
1310         ext     v1.16b,v1.16b,v1.16b,#12       // NEON: undo the earlier #4 rotation of row b
1311         ext     v5.16b,v5.16b,v5.16b,#12
1312         ext     v9.16b,v9.16b,v9.16b,#12
1313         ext     v13.16b,v13.16b,v13.16b,#12
1314         ext     v17.16b,v17.16b,v17.16b,#12
1315         ext     v21.16b,v21.16b,v21.16b,#12
1316         cbnz    x4,.Loop_upper_neon            // repeat until x4 reaches zero
1317
// The scalar state has now been through five passes of two double rounds
// each, so it is finished here while the NEON states are still mid-way.
// Steps: add the saved key block (packed two words per register in
// x22-x28,x30) back in, pack word pairs into 64-bit registers, XOR with
// 64 bytes of input, store 64 bytes of output, bump the counter in x28
// and reload the scalar state from the key block for the lower loop.
1318         add     w5,w5,w22               // accumulate key block
1319         add     x6,x6,x22,lsr#32               // odd words take the high half; 64-bit add, excess bits dropped by lsl#32 below
1320         add     w7,w7,w23
1321         add     x8,x8,x23,lsr#32
1322         add     w9,w9,w24
1323         add     x10,x10,x24,lsr#32
1324         add     w11,w11,w25
1325         add     x12,x12,x25,lsr#32
1326         add     w13,w13,w26
1327         add     x14,x14,x26,lsr#32
1328         add     w15,w15,w27
1329         add     x16,x16,x27,lsr#32
1330         add     w17,w17,w28
1331         add     x19,x19,x28,lsr#32
1332         add     w20,w20,w30
1333         add     x21,x21,x30,lsr#32
1334
1335         add     x5,x5,x6,lsl#32 // pack
1336         add     x7,x7,x8,lsl#32
1337         ldp     x6,x8,[x1,#0]           // load input
1338         add     x9,x9,x10,lsl#32
1339         add     x11,x11,x12,lsl#32
1340         ldp     x10,x12,[x1,#16]
1341         add     x13,x13,x14,lsl#32
1342         add     x15,x15,x16,lsl#32
1343         ldp     x14,x16,[x1,#32]
1344         add     x17,x17,x19,lsl#32
1345         add     x20,x20,x21,lsl#32
1346         ldp     x19,x21,[x1,#48]
1347         add     x1,x1,#64                      // advance input pointer past the 64 bytes just loaded
1348 #ifdef  __ARMEB__
1349         rev     x5,x5                          // big-endian build: byte-reverse each packed pair before the XOR
1350         rev     x7,x7
1351         rev     x9,x9
1352         rev     x11,x11
1353         rev     x13,x13
1354         rev     x15,x15
1355         rev     x17,x17
1356         rev     x20,x20
1357 #endif
1358         eor     x5,x5,x6                       // XOR packed state with the loaded input
1359         eor     x7,x7,x8
1360         eor     x9,x9,x10
1361         eor     x11,x11,x12
1362         eor     x13,x13,x14
1363         eor     x15,x15,x16
1364         eor     x17,x17,x19
1365         eor     x20,x20,x21
1366
1367         stp     x5,x7,[x0,#0]           // store output
1368         add     x28,x28,#1                      // increment counter
1369         mov     w5,w22                  // unpack key block
1370         lsr     x6,x22,#32                     // high word of each packed pair
1371         stp     x9,x11,[x0,#16]
1372         mov     w7,w23
1373         lsr     x8,x23,#32
1374         stp     x13,x15,[x0,#32]
1375         mov     w9,w24
1376         lsr     x10,x24,#32
1377         stp     x17,x20,[x0,#48]
1378         add     x0,x0,#64                      // advance output pointer past the 64 bytes just stored
1379         mov     w11,w25
1380         lsr     x12,x25,#32
1381         mov     w13,w26
1382         lsr     x14,x26,#32
1383         mov     w15,w27
1384         lsr     x16,x27,#32
1385         mov     w17,w28                        // x28 was incremented above, so this state uses the next counter
1386         lsr     x19,x28,#32
1387         mov     w20,w30
1388         lsr     x21,x30,#32
1389
1390         mov     x4,#5                          // five passes for the lower loop
// Lower half of the interleaved round loop; same round structure as
// .Loop_upper_neon.  Entered with x4 = 5 (set just above the label) and
// repeated until the cbnz at the bottom sees x4 == 0, i.e. five passes.
//   scalar: rows a = w5-w8, b = w9-w12, c = w13-w16, d = w17,w19,w20,w21,
//           freshly reloaded from the key block before the loop.  Each
//           pass does two double rounds, so five passes give 20 rounds.
//   NEON:   the six vector states (v0-v3 ... v20-v23, temporaries v24-v29)
//           continue from where the upper loop left them, one double
//           round per pass.
// Rotates: scalar "ror #16/#20/#24/#25" are left-rotates by 16/12/8/7;
// NEON uses rev32 on .8h for 16 and ushr+sli pairs for 12/8/7.
// No instruction in the body writes the condition flags.
1391 .Loop_lower_neon:
1392         sub     x4,x4,#1                       // one pass consumed; tested by cbnz below
1393         add     v0.4s,v0.4s,v1.4s              // NEON quarter round on columns: a += b
1394         add     w5,w5,w9                       // scalar column round: a += b
1395         add     v4.4s,v4.4s,v5.4s
1396         add     w6,w6,w10
1397         add     v8.4s,v8.4s,v9.4s
1398         add     w7,w7,w11
1399         add     v12.4s,v12.4s,v13.4s
1400         add     w8,w8,w12
1401         add     v16.4s,v16.4s,v17.4s
1402         eor     w17,w17,w5                     // scalar: d ^= a
1403         add     v20.4s,v20.4s,v21.4s
1404         eor     w19,w19,w6
1405         eor     v3.16b,v3.16b,v0.16b           // NEON: d ^= a
1406         eor     w20,w20,w7
1407         eor     v7.16b,v7.16b,v4.16b
1408         eor     w21,w21,w8
1409         eor     v11.16b,v11.16b,v8.16b
1410         ror     w17,w17,#16                    // scalar: d <<<= 16
1411         eor     v15.16b,v15.16b,v12.16b
1412         ror     w19,w19,#16
1413         eor     v19.16b,v19.16b,v16.16b
1414         ror     w20,w20,#16
1415         eor     v23.16b,v23.16b,v20.16b
1416         ror     w21,w21,#16
1417         rev32   v3.8h,v3.8h                    // NEON: d <<<= 16 (swap 16-bit halves of each word)
1418         add     w13,w13,w17                    // scalar: c += d
1419         rev32   v7.8h,v7.8h
1420         add     w14,w14,w19
1421         rev32   v11.8h,v11.8h
1422         add     w15,w15,w20
1423         rev32   v15.8h,v15.8h
1424         add     w16,w16,w21
1425         rev32   v19.8h,v19.8h
1426         eor     w9,w9,w13                      // scalar: b ^= c
1427         rev32   v23.8h,v23.8h
1428         eor     w10,w10,w14
1429         add     v2.4s,v2.4s,v3.4s              // NEON: c += d
1430         eor     w11,w11,w15
1431         add     v6.4s,v6.4s,v7.4s
1432         eor     w12,w12,w16
1433         add     v10.4s,v10.4s,v11.4s
1434         ror     w9,w9,#20                      // scalar: b <<<= 12
1435         add     v14.4s,v14.4s,v15.4s
1436         ror     w10,w10,#20
1437         add     v18.4s,v18.4s,v19.4s
1438         ror     w11,w11,#20
1439         add     v22.4s,v22.4s,v23.4s
1440         ror     w12,w12,#20
1441         eor     v24.16b,v1.16b,v2.16b          // NEON: t = b ^ c (into temporaries v24-v29)
1442         add     w5,w5,w9                       // scalar: a += b
1443         eor     v25.16b,v5.16b,v6.16b
1444         add     w6,w6,w10
1445         eor     v26.16b,v9.16b,v10.16b
1446         add     w7,w7,w11
1447         eor     v27.16b,v13.16b,v14.16b
1448         add     w8,w8,w12
1449         eor     v28.16b,v17.16b,v18.16b
1450         eor     w17,w17,w5                     // scalar: d ^= a
1451         eor     v29.16b,v21.16b,v22.16b
1452         eor     w19,w19,w6
1453         ushr    v1.4s,v24.4s,#20               // NEON: b = t <<< 12, first half (t >> 20)
1454         eor     w20,w20,w7
1455         ushr    v5.4s,v25.4s,#20
1456         eor     w21,w21,w8
1457         ushr    v9.4s,v26.4s,#20
1458         ror     w17,w17,#24                    // scalar: d <<<= 8
1459         ushr    v13.4s,v27.4s,#20
1460         ror     w19,w19,#24
1461         ushr    v17.4s,v28.4s,#20
1462         ror     w20,w20,#24
1463         ushr    v21.4s,v29.4s,#20
1464         ror     w21,w21,#24
1465         sli     v1.4s,v24.4s,#12               // NEON: b = t <<< 12, second half (insert t << 12)
1466         add     w13,w13,w17                    // scalar: c += d
1467         sli     v5.4s,v25.4s,#12
1468         add     w14,w14,w19
1469         sli     v9.4s,v26.4s,#12
1470         add     w15,w15,w20
1471         sli     v13.4s,v27.4s,#12
1472         add     w16,w16,w21
1473         sli     v17.4s,v28.4s,#12
1474         eor     w9,w9,w13                      // scalar: b ^= c
1475         sli     v21.4s,v29.4s,#12
1476         eor     w10,w10,w14
1477         add     v0.4s,v0.4s,v1.4s              // NEON: a += b
1478         eor     w11,w11,w15
1479         add     v4.4s,v4.4s,v5.4s
1480         eor     w12,w12,w16
1481         add     v8.4s,v8.4s,v9.4s
1482         ror     w9,w9,#25                      // scalar: b <<<= 7 (column round done)
1483         add     v12.4s,v12.4s,v13.4s
1484         ror     w10,w10,#25
1485         add     v16.4s,v16.4s,v17.4s
1486         ror     w11,w11,#25
1487         add     v20.4s,v20.4s,v21.4s
1488         ror     w12,w12,#25
1489         eor     v24.16b,v3.16b,v0.16b          // NEON: t = d ^ a
1490         add     w5,w5,w10                      // scalar diagonal round: a += b, b index shifted by one
1491         eor     v25.16b,v7.16b,v4.16b
1492         add     w6,w6,w11
1493         eor     v26.16b,v11.16b,v8.16b
1494         add     w7,w7,w12
1495         eor     v27.16b,v15.16b,v12.16b
1496         add     w8,w8,w9
1497         eor     v28.16b,v19.16b,v16.16b
1498         eor     w21,w21,w5
1499         eor     v29.16b,v23.16b,v20.16b
1500         eor     w17,w17,w6
1501         ushr    v3.4s,v24.4s,#24               // NEON: d = t <<< 8 (ushr #24 + sli #8)
1502         eor     w19,w19,w7
1503         ushr    v7.4s,v25.4s,#24
1504         eor     w20,w20,w8
1505         ushr    v11.4s,v26.4s,#24
1506         ror     w21,w21,#16
1507         ushr    v15.4s,v27.4s,#24
1508         ror     w17,w17,#16
1509         ushr    v19.4s,v28.4s,#24
1510         ror     w19,w19,#16
1511         ushr    v23.4s,v29.4s,#24
1512         ror     w20,w20,#16
1513         sli     v3.4s,v24.4s,#8
1514         add     w15,w15,w21
1515         sli     v7.4s,v25.4s,#8
1516         add     w16,w16,w17
1517         sli     v11.4s,v26.4s,#8
1518         add     w13,w13,w19
1519         sli     v15.4s,v27.4s,#8
1520         add     w14,w14,w20
1521         sli     v19.4s,v28.4s,#8
1522         eor     w10,w10,w15
1523         sli     v23.4s,v29.4s,#8
1524         eor     w11,w11,w16
1525         add     v2.4s,v2.4s,v3.4s              // NEON: c += d
1526         eor     w12,w12,w13
1527         add     v6.4s,v6.4s,v7.4s
1528         eor     w9,w9,w14
1529         add     v10.4s,v10.4s,v11.4s
1530         ror     w10,w10,#20
1531         add     v14.4s,v14.4s,v15.4s
1532         ror     w11,w11,#20
1533         add     v18.4s,v18.4s,v19.4s
1534         ror     w12,w12,#20
1535         add     v22.4s,v22.4s,v23.4s
1536         ror     w9,w9,#20
1537         eor     v24.16b,v1.16b,v2.16b          // NEON: t = b ^ c
1538         add     w5,w5,w10
1539         eor     v25.16b,v5.16b,v6.16b
1540         add     w6,w6,w11
1541         eor     v26.16b,v9.16b,v10.16b
1542         add     w7,w7,w12
1543         eor     v27.16b,v13.16b,v14.16b
1544         add     w8,w8,w9
1545         eor     v28.16b,v17.16b,v18.16b
1546         eor     w21,w21,w5
1547         eor     v29.16b,v21.16b,v22.16b
1548         eor     w17,w17,w6
1549         ushr    v1.4s,v24.4s,#25               // NEON: b = t <<< 7 (ushr #25 + sli #7)
1550         eor     w19,w19,w7
1551         ushr    v5.4s,v25.4s,#25
1552         eor     w20,w20,w8
1553         ushr    v9.4s,v26.4s,#25
1554         ror     w21,w21,#24
1555         ushr    v13.4s,v27.4s,#25
1556         ror     w17,w17,#24
1557         ushr    v17.4s,v28.4s,#25
1558         ror     w19,w19,#24
1559         ushr    v21.4s,v29.4s,#25
1560         ror     w20,w20,#24
1561         sli     v1.4s,v24.4s,#7
1562         add     w15,w15,w21
1563         sli     v5.4s,v25.4s,#7
1564         add     w16,w16,w17
1565         sli     v9.4s,v26.4s,#7
1566         add     w13,w13,w19
1567         sli     v13.4s,v27.4s,#7
1568         add     w14,w14,w20
1569         sli     v17.4s,v28.4s,#7
1570         eor     w10,w10,w15
1571         sli     v21.4s,v29.4s,#7
1572         eor     w11,w11,w16
1573         ext     v2.16b,v2.16b,v2.16b,#8        // NEON: rotate row c by 8 bytes (two lanes)
1574         eor     w12,w12,w13
1575         ext     v6.16b,v6.16b,v6.16b,#8
1576         eor     w9,w9,w14
1577         ext     v10.16b,v10.16b,v10.16b,#8
1578         ror     w10,w10,#25
1579         ext     v14.16b,v14.16b,v14.16b,#8
1580         ror     w11,w11,#25
1581         ext     v18.16b,v18.16b,v18.16b,#8
1582         ror     w12,w12,#25
1583         ext     v22.16b,v22.16b,v22.16b,#8
1584         ror     w9,w9,#25                      // scalar: first double round done
1585         ext     v3.16b,v3.16b,v3.16b,#12       // NEON: rotate row d by 12 bytes
1586         ext     v7.16b,v7.16b,v7.16b,#12
1587         ext     v11.16b,v11.16b,v11.16b,#12
1588         ext     v15.16b,v15.16b,v15.16b,#12
1589         ext     v19.16b,v19.16b,v19.16b,#12
1590         ext     v23.16b,v23.16b,v23.16b,#12
1591         ext     v1.16b,v1.16b,v1.16b,#4        // NEON: rotate row b by 4 bytes; diagonals now sit in columns
1592         ext     v5.16b,v5.16b,v5.16b,#4
1593         ext     v9.16b,v9.16b,v9.16b,#4
1594         ext     v13.16b,v13.16b,v13.16b,#4
1595         ext     v17.16b,v17.16b,v17.16b,#4
1596         ext     v21.16b,v21.16b,v21.16b,#4
1597         add     v0.4s,v0.4s,v1.4s              // NEON quarter round on diagonals: a += b
1598         add     w5,w5,w9                       // scalar second double round, column round: a += b
1599         add     v4.4s,v4.4s,v5.4s
1600         add     w6,w6,w10
1601         add     v8.4s,v8.4s,v9.4s
1602         add     w7,w7,w11
1603         add     v12.4s,v12.4s,v13.4s
1604         add     w8,w8,w12
1605         add     v16.4s,v16.4s,v17.4s
1606         eor     w17,w17,w5
1607         add     v20.4s,v20.4s,v21.4s
1608         eor     w19,w19,w6
1609         eor     v3.16b,v3.16b,v0.16b
1610         eor     w20,w20,w7
1611         eor     v7.16b,v7.16b,v4.16b
1612         eor     w21,w21,w8
1613         eor     v11.16b,v11.16b,v8.16b
1614         ror     w17,w17,#16
1615         eor     v15.16b,v15.16b,v12.16b
1616         ror     w19,w19,#16
1617         eor     v19.16b,v19.16b,v16.16b
1618         ror     w20,w20,#16
1619         eor     v23.16b,v23.16b,v20.16b
1620         ror     w21,w21,#16
1621         rev32   v3.8h,v3.8h
1622         add     w13,w13,w17
1623         rev32   v7.8h,v7.8h
1624         add     w14,w14,w19
1625         rev32   v11.8h,v11.8h
1626         add     w15,w15,w20
1627         rev32   v15.8h,v15.8h
1628         add     w16,w16,w21
1629         rev32   v19.8h,v19.8h
1630         eor     w9,w9,w13
1631         rev32   v23.8h,v23.8h
1632         eor     w10,w10,w14
1633         add     v2.4s,v2.4s,v3.4s
1634         eor     w11,w11,w15
1635         add     v6.4s,v6.4s,v7.4s
1636         eor     w12,w12,w16
1637         add     v10.4s,v10.4s,v11.4s
1638         ror     w9,w9,#20
1639         add     v14.4s,v14.4s,v15.4s
1640         ror     w10,w10,#20
1641         add     v18.4s,v18.4s,v19.4s
1642         ror     w11,w11,#20
1643         add     v22.4s,v22.4s,v23.4s
1644         ror     w12,w12,#20
1645         eor     v24.16b,v1.16b,v2.16b
1646         add     w5,w5,w9
1647         eor     v25.16b,v5.16b,v6.16b
1648         add     w6,w6,w10
1649         eor     v26.16b,v9.16b,v10.16b
1650         add     w7,w7,w11
1651         eor     v27.16b,v13.16b,v14.16b
1652         add     w8,w8,w12
1653         eor     v28.16b,v17.16b,v18.16b
1654         eor     w17,w17,w5
1655         eor     v29.16b,v21.16b,v22.16b
1656         eor     w19,w19,w6
1657         ushr    v1.4s,v24.4s,#20
1658         eor     w20,w20,w7
1659         ushr    v5.4s,v25.4s,#20
1660         eor     w21,w21,w8
1661         ushr    v9.4s,v26.4s,#20
1662         ror     w17,w17,#24
1663         ushr    v13.4s,v27.4s,#20
1664         ror     w19,w19,#24
1665         ushr    v17.4s,v28.4s,#20
1666         ror     w20,w20,#24
1667         ushr    v21.4s,v29.4s,#20
1668         ror     w21,w21,#24
1669         sli     v1.4s,v24.4s,#12
1670         add     w13,w13,w17
1671         sli     v5.4s,v25.4s,#12
1672         add     w14,w14,w19
1673         sli     v9.4s,v26.4s,#12
1674         add     w15,w15,w20
1675         sli     v13.4s,v27.4s,#12
1676         add     w16,w16,w21
1677         sli     v17.4s,v28.4s,#12
1678         eor     w9,w9,w13
1679         sli     v21.4s,v29.4s,#12
1680         eor     w10,w10,w14
1681         add     v0.4s,v0.4s,v1.4s
1682         eor     w11,w11,w15
1683         add     v4.4s,v4.4s,v5.4s
1684         eor     w12,w12,w16
1685         add     v8.4s,v8.4s,v9.4s
1686         ror     w9,w9,#25
1687         add     v12.4s,v12.4s,v13.4s
1688         ror     w10,w10,#25
1689         add     v16.4s,v16.4s,v17.4s
1690         ror     w11,w11,#25
1691         add     v20.4s,v20.4s,v21.4s
1692         ror     w12,w12,#25
1693         eor     v24.16b,v3.16b,v0.16b
1694         add     w5,w5,w10                      // scalar second double round, diagonal round
1695         eor     v25.16b,v7.16b,v4.16b
1696         add     w6,w6,w11
1697         eor     v26.16b,v11.16b,v8.16b
1698         add     w7,w7,w12
1699         eor     v27.16b,v15.16b,v12.16b
1700         add     w8,w8,w9
1701         eor     v28.16b,v19.16b,v16.16b
1702         eor     w21,w21,w5
1703         eor     v29.16b,v23.16b,v20.16b
1704         eor     w17,w17,w6
1705         ushr    v3.4s,v24.4s,#24
1706         eor     w19,w19,w7
1707         ushr    v7.4s,v25.4s,#24
1708         eor     w20,w20,w8
1709         ushr    v11.4s,v26.4s,#24
1710         ror     w21,w21,#16
1711         ushr    v15.4s,v27.4s,#24
1712         ror     w17,w17,#16
1713         ushr    v19.4s,v28.4s,#24
1714         ror     w19,w19,#16
1715         ushr    v23.4s,v29.4s,#24
1716         ror     w20,w20,#16
1717         sli     v3.4s,v24.4s,#8
1718         add     w15,w15,w21
1719         sli     v7.4s,v25.4s,#8
1720         add     w16,w16,w17
1721         sli     v11.4s,v26.4s,#8
1722         add     w13,w13,w19
1723         sli     v15.4s,v27.4s,#8
1724         add     w14,w14,w20
1725         sli     v19.4s,v28.4s,#8
1726         eor     w10,w10,w15
1727         sli     v23.4s,v29.4s,#8
1728         eor     w11,w11,w16
1729         add     v2.4s,v2.4s,v3.4s
1730         eor     w12,w12,w13
1731         add     v6.4s,v6.4s,v7.4s
1732         eor     w9,w9,w14
1733         add     v10.4s,v10.4s,v11.4s
1734         ror     w10,w10,#20
1735         add     v14.4s,v14.4s,v15.4s
1736         ror     w11,w11,#20
1737         add     v18.4s,v18.4s,v19.4s
1738         ror     w12,w12,#20
1739         add     v22.4s,v22.4s,v23.4s
1740         ror     w9,w9,#20
1741         eor     v24.16b,v1.16b,v2.16b
1742         add     w5,w5,w10
1743         eor     v25.16b,v5.16b,v6.16b
1744         add     w6,w6,w11
1745         eor     v26.16b,v9.16b,v10.16b
1746         add     w7,w7,w12
1747         eor     v27.16b,v13.16b,v14.16b
1748         add     w8,w8,w9
1749         eor     v28.16b,v17.16b,v18.16b
1750         eor     w21,w21,w5
1751         eor     v29.16b,v21.16b,v22.16b
1752         eor     w17,w17,w6
1753         ushr    v1.4s,v24.4s,#25
1754         eor     w19,w19,w7
1755         ushr    v5.4s,v25.4s,#25
1756         eor     w20,w20,w8
1757         ushr    v9.4s,v26.4s,#25
1758         ror     w21,w21,#24
1759         ushr    v13.4s,v27.4s,#25
1760         ror     w17,w17,#24
1761         ushr    v17.4s,v28.4s,#25
1762         ror     w19,w19,#24
1763         ushr    v21.4s,v29.4s,#25
1764         ror     w20,w20,#24
1765         sli     v1.4s,v24.4s,#7
1766         add     w15,w15,w21
1767         sli     v5.4s,v25.4s,#7
1768         add     w16,w16,w17
1769         sli     v9.4s,v26.4s,#7
1770         add     w13,w13,w19
1771         sli     v13.4s,v27.4s,#7
1772         add     w14,w14,w20
1773         sli     v17.4s,v28.4s,#7
1774         eor     w10,w10,w15
1775         sli     v21.4s,v29.4s,#7
1776         eor     w11,w11,w16
1777         ext     v2.16b,v2.16b,v2.16b,#8        // NEON: rotate row c by 8 bytes again (back to original order)
1778         eor     w12,w12,w13
1779         ext     v6.16b,v6.16b,v6.16b,#8
1780         eor     w9,w9,w14
1781         ext     v10.16b,v10.16b,v10.16b,#8
1782         ror     w10,w10,#25
1783         ext     v14.16b,v14.16b,v14.16b,#8
1784         ror     w11,w11,#25
1785         ext     v18.16b,v18.16b,v18.16b,#8
1786         ror     w12,w12,#25
1787         ext     v22.16b,v22.16b,v22.16b,#8
1788         ror     w9,w9,#25                      // scalar: second double round done
1789         ext     v3.16b,v3.16b,v3.16b,#4        // NEON: undo the earlier #12 rotation of row d
1790         ext     v7.16b,v7.16b,v7.16b,#4
1791         ext     v11.16b,v11.16b,v11.16b,#4
1792         ext     v15.16b,v15.16b,v15.16b,#4
1793         ext     v19.16b,v19.16b,v19.16b,#4
1794         ext     v23.16b,v23.16b,v23.16b,#4
1795         ext     v1.16b,v1.16b,v1.16b,#12       // NEON: undo the earlier #4 rotation of row b
1796         ext     v5.16b,v5.16b,v5.16b,#12
1797         ext     v9.16b,v9.16b,v9.16b,#12
1798         ext     v13.16b,v13.16b,v13.16b,#12
1799         ext     v17.16b,v17.16b,v17.16b,#12
1800         ext     v21.16b,v21.16b,v21.16b,#12
1801         cbnz    x4,.Loop_lower_neon            // repeat until x4 reaches zero
1802
1803         add     w5,w5,w22               // accumulate key block
1804         ldp     q24,q25,[sp,#0]
1805         add     x6,x6,x22,lsr#32
1806         ldp     q26,q27,[sp,#32]
1807         add     w7,w7,w23
1808         ldp     q28,q29,[sp,#64]
1809         add     x8,x8,x23,lsr#32
1810         add     v0.4s,v0.4s,v24.4s
1811         add     w9,w9,w24
1812         add     v4.4s,v4.4s,v24.4s
1813         add     x10,x10,x24,lsr#32
1814         add     v8.4s,v8.4s,v24.4s
1815         add     w11,w11,w25
1816         add     v12.4s,v12.4s,v24.4s
1817         add     x12,x12,x25,lsr#32
1818         add     v16.4s,v16.4s,v24.4s
1819         add     w13,w13,w26
1820         add     v20.4s,v20.4s,v24.4s
1821         add     x14,x14,x26,lsr#32
1822         add     v2.4s,v2.4s,v26.4s
1823         add     w15,w15,w27
1824         add     v6.4s,v6.4s,v26.4s
1825         add     x16,x16,x27,lsr#32
1826         add     v10.4s,v10.4s,v26.4s
1827         add     w17,w17,w28
1828         add     v14.4s,v14.4s,v26.4s
1829         add     x19,x19,x28,lsr#32
1830         add     v18.4s,v18.4s,v26.4s
1831         add     w20,w20,w30
1832         add     v22.4s,v22.4s,v26.4s
1833         add     x21,x21,x30,lsr#32
1834         add     v19.4s,v19.4s,v31.4s                    // +4
1835         add     x5,x5,x6,lsl#32 // pack
1836         add     v23.4s,v23.4s,v31.4s                    // +4
1837         add     x7,x7,x8,lsl#32
1838         add     v3.4s,v3.4s,v27.4s
1839         ldp     x6,x8,[x1,#0]           // load input
1840         add     v7.4s,v7.4s,v28.4s
1841         add     x9,x9,x10,lsl#32
1842         add     v11.4s,v11.4s,v29.4s
1843         add     x11,x11,x12,lsl#32
1844         add     v15.4s,v15.4s,v30.4s
1845         ldp     x10,x12,[x1,#16]
1846         add     v19.4s,v19.4s,v27.4s
1847         add     x13,x13,x14,lsl#32
1848         add     v23.4s,v23.4s,v28.4s
1849         add     x15,x15,x16,lsl#32
1850         add     v1.4s,v1.4s,v25.4s
1851         ldp     x14,x16,[x1,#32]
1852         add     v5.4s,v5.4s,v25.4s
1853         add     x17,x17,x19,lsl#32
1854         add     v9.4s,v9.4s,v25.4s
1855         add     x20,x20,x21,lsl#32
1856         add     v13.4s,v13.4s,v25.4s
1857         ldp     x19,x21,[x1,#48]
1858         add     v17.4s,v17.4s,v25.4s
1859         add     x1,x1,#64
1860         add     v21.4s,v21.4s,v25.4s
1861
1862 #ifdef  __ARMEB__
1863         rev     x5,x5
1864         rev     x7,x7
1865         rev     x9,x9
1866         rev     x11,x11
1867         rev     x13,x13
1868         rev     x15,x15
1869         rev     x17,x17
1870         rev     x20,x20
1871 #endif
1872         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1873         eor     x5,x5,x6
1874         eor     x7,x7,x8
1875         eor     x9,x9,x10
1876         eor     x11,x11,x12
1877         eor     x13,x13,x14
1878         eor     v0.16b,v0.16b,v24.16b
1879         eor     x15,x15,x16
1880         eor     v1.16b,v1.16b,v25.16b
1881         eor     x17,x17,x19
1882         eor     v2.16b,v2.16b,v26.16b
1883         eor     x20,x20,x21
1884         eor     v3.16b,v3.16b,v27.16b
1885         ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1886
1887         stp     x5,x7,[x0,#0]           // store output
1888         add     x28,x28,#7                      // increment counter
1889         stp     x9,x11,[x0,#16]
1890         stp     x13,x15,[x0,#32]
1891         stp     x17,x20,[x0,#48]
1892         add     x0,x0,#64
1893         st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1894
1895         ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1896         eor     v4.16b,v4.16b,v24.16b
1897         eor     v5.16b,v5.16b,v25.16b
1898         eor     v6.16b,v6.16b,v26.16b
1899         eor     v7.16b,v7.16b,v27.16b
1900         st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1901
1902         ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1903         eor     v8.16b,v8.16b,v0.16b
1904         ldp     q24,q25,[sp,#0]
1905         eor     v9.16b,v9.16b,v1.16b
1906         ldp     q26,q27,[sp,#32]
1907         eor     v10.16b,v10.16b,v2.16b
1908         eor     v11.16b,v11.16b,v3.16b
1909         st1     {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1910
1911         ld1     {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1912         eor     v12.16b,v12.16b,v4.16b
1913         eor     v13.16b,v13.16b,v5.16b
1914         eor     v14.16b,v14.16b,v6.16b
1915         eor     v15.16b,v15.16b,v7.16b
1916         st1     {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1917
1918         ld1     {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1919         eor     v16.16b,v16.16b,v8.16b
1920         eor     v17.16b,v17.16b,v9.16b
1921         eor     v18.16b,v18.16b,v10.16b
1922         eor     v19.16b,v19.16b,v11.16b
1923         st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1924
1925         shl     v0.4s,v31.4s,#1                 // 4 -> 8
1926         eor     v20.16b,v20.16b,v12.16b
1927         eor     v21.16b,v21.16b,v13.16b
1928         eor     v22.16b,v22.16b,v14.16b
1929         eor     v23.16b,v23.16b,v15.16b
1930         st1     {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1931
1932         add     v27.4s,v27.4s,v0.4s                     // += 8
1933         add     v28.4s,v28.4s,v0.4s
1934         add     v29.4s,v29.4s,v0.4s
1935         add     v30.4s,v30.4s,v0.4s
1936
1937         b.hs    .Loop_outer_512_neon
1938
1939         adds    x2,x2,#512
1940         ushr    v0.4s,v31.4s,#2                 // 4 -> 1
1941
1942         ldp     d8,d9,[sp,#128+0]               // meet ABI requirements
1943         ldp     d10,d11,[sp,#128+16]
1944         ldp     d12,d13,[sp,#128+32]
1945         ldp     d14,d15,[sp,#128+48]
1946
1947         stp     q24,q31,[sp,#0]         // wipe off-load area
1948         stp     q24,q31,[sp,#32]
1949         stp     q24,q31,[sp,#64]
1950
1951         b.eq    .Ldone_512_neon
1952
1953         cmp     x2,#192
1954         sub     v27.4s,v27.4s,v0.4s                     // -= 1
1955         sub     v28.4s,v28.4s,v0.4s
1956         sub     v29.4s,v29.4s,v0.4s
1957         add     sp,sp,#128
1958         b.hs    .Loop_outer_neon
1959
1960         eor     v25.16b,v25.16b,v25.16b
1961         eor     v26.16b,v26.16b,v26.16b
1962         eor     v27.16b,v27.16b,v27.16b
1963         eor     v28.16b,v28.16b,v28.16b
1964         eor     v29.16b,v29.16b,v29.16b
1965         eor     v30.16b,v30.16b,v30.16b
1966         b       .Loop_outer
1967
1968 .Ldone_512_neon:
1969         ldp     x19,x20,[x29,#16]
1970         add     sp,sp,#128+64
1971         ldp     x21,x22,[x29,#32]
1972         ldp     x23,x24,[x29,#48]
1973         ldp     x25,x26,[x29,#64]
1974         ldp     x27,x28,[x29,#80]
1975         ldp     x29,x30,[sp],#96
1976 .inst   0xd50323bf                      // autiasp
1977         ret
1978 .size   ChaCha20_512_neon,.-ChaCha20_512_neon