]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - secure/lib/libcrypto/aarch64/aesv8-armx.S
Regen assembly files for aarch64.
[FreeBSD/FreeBSD.git] / secure / lib / libcrypto / aarch64 / aesv8-armx.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3 #include "arm_arch.h"
4
5 #if __ARM_MAX_ARCH__>=7
6 .text
7 .align  5
8 .Lrcon:
9 .long   0x01,0x01,0x01,0x01
10 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
11 .long   0x1b,0x1b,0x1b,0x1b
12
13 .globl  aes_v8_set_encrypt_key
14 .type   aes_v8_set_encrypt_key,%function
15 .align  5
16 aes_v8_set_encrypt_key:
17 .Lenc_key:
18         stp     x29,x30,[sp,#-16]!
19         add     x29,sp,#0
20         mov     x3,#-1
21         cmp     x0,#0
22         b.eq    .Lenc_key_abort
23         cmp     x2,#0
24         b.eq    .Lenc_key_abort
25         mov     x3,#-2
26         cmp     w1,#128
27         b.lt    .Lenc_key_abort
28         cmp     w1,#256
29         b.gt    .Lenc_key_abort
30         tst     w1,#0x3f
31         b.ne    .Lenc_key_abort
32
33         adr     x3,.Lrcon
34         cmp     w1,#192
35
36         eor     v0.16b,v0.16b,v0.16b
37         ld1     {v3.16b},[x0],#16
38         mov     w1,#8           // reuse w1
39         ld1     {v1.4s,v2.4s},[x3],#32
40
41         b.lt    .Loop128
42         b.eq    .L192
43         b       .L256
44
45 .align  4
46 .Loop128:
47         tbl     v6.16b,{v3.16b},v2.16b
48         ext     v5.16b,v0.16b,v3.16b,#12
49         st1     {v3.4s},[x2],#16
50         aese    v6.16b,v0.16b
51         subs    w1,w1,#1
52
53         eor     v3.16b,v3.16b,v5.16b
54         ext     v5.16b,v0.16b,v5.16b,#12
55         eor     v3.16b,v3.16b,v5.16b
56         ext     v5.16b,v0.16b,v5.16b,#12
57         eor     v6.16b,v6.16b,v1.16b
58         eor     v3.16b,v3.16b,v5.16b
59         shl     v1.16b,v1.16b,#1
60         eor     v3.16b,v3.16b,v6.16b
61         b.ne    .Loop128
62
63         ld1     {v1.4s},[x3]
64
65         tbl     v6.16b,{v3.16b},v2.16b
66         ext     v5.16b,v0.16b,v3.16b,#12
67         st1     {v3.4s},[x2],#16
68         aese    v6.16b,v0.16b
69
70         eor     v3.16b,v3.16b,v5.16b
71         ext     v5.16b,v0.16b,v5.16b,#12
72         eor     v3.16b,v3.16b,v5.16b
73         ext     v5.16b,v0.16b,v5.16b,#12
74         eor     v6.16b,v6.16b,v1.16b
75         eor     v3.16b,v3.16b,v5.16b
76         shl     v1.16b,v1.16b,#1
77         eor     v3.16b,v3.16b,v6.16b
78
79         tbl     v6.16b,{v3.16b},v2.16b
80         ext     v5.16b,v0.16b,v3.16b,#12
81         st1     {v3.4s},[x2],#16
82         aese    v6.16b,v0.16b
83
84         eor     v3.16b,v3.16b,v5.16b
85         ext     v5.16b,v0.16b,v5.16b,#12
86         eor     v3.16b,v3.16b,v5.16b
87         ext     v5.16b,v0.16b,v5.16b,#12
88         eor     v6.16b,v6.16b,v1.16b
89         eor     v3.16b,v3.16b,v5.16b
90         eor     v3.16b,v3.16b,v6.16b
91         st1     {v3.4s},[x2]
92         add     x2,x2,#0x50
93
94         mov     w12,#10
95         b       .Ldone
96
97 .align  4
98 .L192:
99         ld1     {v4.8b},[x0],#8
100         movi    v6.16b,#8                       // borrow v6.16b
101         st1     {v3.4s},[x2],#16
102         sub     v2.16b,v2.16b,v6.16b    // adjust the mask
103
104 .Loop192:
105         tbl     v6.16b,{v4.16b},v2.16b
106         ext     v5.16b,v0.16b,v3.16b,#12
107         st1     {v4.8b},[x2],#8
108         aese    v6.16b,v0.16b
109         subs    w1,w1,#1
110
111         eor     v3.16b,v3.16b,v5.16b
112         ext     v5.16b,v0.16b,v5.16b,#12
113         eor     v3.16b,v3.16b,v5.16b
114         ext     v5.16b,v0.16b,v5.16b,#12
115         eor     v3.16b,v3.16b,v5.16b
116
117         dup     v5.4s,v3.s[3]
118         eor     v5.16b,v5.16b,v4.16b
119         eor     v6.16b,v6.16b,v1.16b
120         ext     v4.16b,v0.16b,v4.16b,#12
121         shl     v1.16b,v1.16b,#1
122         eor     v4.16b,v4.16b,v5.16b
123         eor     v3.16b,v3.16b,v6.16b
124         eor     v4.16b,v4.16b,v6.16b
125         st1     {v3.4s},[x2],#16
126         b.ne    .Loop192
127
128         mov     w12,#12
129         add     x2,x2,#0x20
130         b       .Ldone
131
132 .align  4
133 .L256:
134         ld1     {v4.16b},[x0]
135         mov     w1,#7
136         mov     w12,#14
137         st1     {v3.4s},[x2],#16
138
139 .Loop256:
140         tbl     v6.16b,{v4.16b},v2.16b
141         ext     v5.16b,v0.16b,v3.16b,#12
142         st1     {v4.4s},[x2],#16
143         aese    v6.16b,v0.16b
144         subs    w1,w1,#1
145
146         eor     v3.16b,v3.16b,v5.16b
147         ext     v5.16b,v0.16b,v5.16b,#12
148         eor     v3.16b,v3.16b,v5.16b
149         ext     v5.16b,v0.16b,v5.16b,#12
150         eor     v6.16b,v6.16b,v1.16b
151         eor     v3.16b,v3.16b,v5.16b
152         shl     v1.16b,v1.16b,#1
153         eor     v3.16b,v3.16b,v6.16b
154         st1     {v3.4s},[x2],#16
155         b.eq    .Ldone
156
157         dup     v6.4s,v3.s[3]           // just splat
158         ext     v5.16b,v0.16b,v4.16b,#12
159         aese    v6.16b,v0.16b
160
161         eor     v4.16b,v4.16b,v5.16b
162         ext     v5.16b,v0.16b,v5.16b,#12
163         eor     v4.16b,v4.16b,v5.16b
164         ext     v5.16b,v0.16b,v5.16b,#12
165         eor     v4.16b,v4.16b,v5.16b
166
167         eor     v4.16b,v4.16b,v6.16b
168         b       .Loop256
169
170 .Ldone:
171         str     w12,[x2]
172         mov     x3,#0
173
174 .Lenc_key_abort:
175         mov     x0,x3                   // return value
176         ldr     x29,[sp],#16
177         ret
178 .size   aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
179
180 .globl  aes_v8_set_decrypt_key
181 .type   aes_v8_set_decrypt_key,%function
182 .align  5
183 aes_v8_set_decrypt_key:
184         stp     x29,x30,[sp,#-16]!
185         add     x29,sp,#0
186         bl      .Lenc_key
187
188         cmp     x0,#0
189         b.ne    .Ldec_key_abort
190
191         sub     x2,x2,#240              // restore original x2
192         mov     x4,#-16
193         add     x0,x2,x12,lsl#4 // end of key schedule
194
195         ld1     {v0.4s},[x2]
196         ld1     {v1.4s},[x0]
197         st1     {v0.4s},[x0],x4
198         st1     {v1.4s},[x2],#16
199
200 .Loop_imc:
201         ld1     {v0.4s},[x2]
202         ld1     {v1.4s},[x0]
203         aesimc  v0.16b,v0.16b
204         aesimc  v1.16b,v1.16b
205         st1     {v0.4s},[x0],x4
206         st1     {v1.4s},[x2],#16
207         cmp     x0,x2
208         b.hi    .Loop_imc
209
210         ld1     {v0.4s},[x2]
211         aesimc  v0.16b,v0.16b
212         st1     {v0.4s},[x0]
213
214         eor     x0,x0,x0                // return value
215 .Ldec_key_abort:
216         ldp     x29,x30,[sp],#16
217         ret
218 .size   aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
219 .globl  aes_v8_encrypt
220 .type   aes_v8_encrypt,%function
221 .align  5
222 aes_v8_encrypt:
223         ldr     w3,[x2,#240]
224         ld1     {v0.4s},[x2],#16
225         ld1     {v2.16b},[x0]
226         sub     w3,w3,#2
227         ld1     {v1.4s},[x2],#16
228
229 .Loop_enc:
230         aese    v2.16b,v0.16b
231         aesmc   v2.16b,v2.16b
232         ld1     {v0.4s},[x2],#16
233         subs    w3,w3,#2
234         aese    v2.16b,v1.16b
235         aesmc   v2.16b,v2.16b
236         ld1     {v1.4s},[x2],#16
237         b.gt    .Loop_enc
238
239         aese    v2.16b,v0.16b
240         aesmc   v2.16b,v2.16b
241         ld1     {v0.4s},[x2]
242         aese    v2.16b,v1.16b
243         eor     v2.16b,v2.16b,v0.16b
244
245         st1     {v2.16b},[x1]
246         ret
247 .size   aes_v8_encrypt,.-aes_v8_encrypt
248 .globl  aes_v8_decrypt
249 .type   aes_v8_decrypt,%function
250 .align  5
251 aes_v8_decrypt:
252         ldr     w3,[x2,#240]
253         ld1     {v0.4s},[x2],#16
254         ld1     {v2.16b},[x0]
255         sub     w3,w3,#2
256         ld1     {v1.4s},[x2],#16
257
258 .Loop_dec:
259         aesd    v2.16b,v0.16b
260         aesimc  v2.16b,v2.16b
261         ld1     {v0.4s},[x2],#16
262         subs    w3,w3,#2
263         aesd    v2.16b,v1.16b
264         aesimc  v2.16b,v2.16b
265         ld1     {v1.4s},[x2],#16
266         b.gt    .Loop_dec
267
268         aesd    v2.16b,v0.16b
269         aesimc  v2.16b,v2.16b
270         ld1     {v0.4s},[x2]
271         aesd    v2.16b,v1.16b
272         eor     v2.16b,v2.16b,v0.16b
273
274         st1     {v2.16b},[x1]
275         ret
276 .size   aes_v8_decrypt,.-aes_v8_decrypt
277 .globl  aes_v8_cbc_encrypt
278 .type   aes_v8_cbc_encrypt,%function
279 .align  5
280 aes_v8_cbc_encrypt:
281         stp     x29,x30,[sp,#-16]!
282         add     x29,sp,#0
283         subs    x2,x2,#16
284         mov     x8,#16
285         b.lo    .Lcbc_abort
286         csel    x8,xzr,x8,eq
287
288         cmp     w5,#0                   // en- or decrypting?
289         ldr     w5,[x3,#240]
290         and     x2,x2,#-16
291         ld1     {v6.16b},[x4]
292         ld1     {v0.16b},[x0],x8
293
294         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
295         sub     w5,w5,#6
296         add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
297         sub     w5,w5,#2
298         ld1     {v18.4s,v19.4s},[x7],#32
299         ld1     {v20.4s,v21.4s},[x7],#32
300         ld1     {v22.4s,v23.4s},[x7],#32
301         ld1     {v7.4s},[x7]
302
303         add     x7,x3,#32
304         mov     w6,w5
305         b.eq    .Lcbc_dec
306
307         cmp     w5,#2
308         eor     v0.16b,v0.16b,v6.16b
309         eor     v5.16b,v16.16b,v7.16b
310         b.eq    .Lcbc_enc128
311
312         ld1     {v2.4s,v3.4s},[x7]
313         add     x7,x3,#16
314         add     x6,x3,#16*4
315         add     x12,x3,#16*5
316         aese    v0.16b,v16.16b
317         aesmc   v0.16b,v0.16b
318         add     x14,x3,#16*6
319         add     x3,x3,#16*7
320         b       .Lenter_cbc_enc
321
322 .align  4
323 .Loop_cbc_enc:
324         aese    v0.16b,v16.16b
325         aesmc   v0.16b,v0.16b
326         st1     {v6.16b},[x1],#16
327 .Lenter_cbc_enc:
328         aese    v0.16b,v17.16b
329         aesmc   v0.16b,v0.16b
330         aese    v0.16b,v2.16b
331         aesmc   v0.16b,v0.16b
332         ld1     {v16.4s},[x6]
333         cmp     w5,#4
334         aese    v0.16b,v3.16b
335         aesmc   v0.16b,v0.16b
336         ld1     {v17.4s},[x12]
337         b.eq    .Lcbc_enc192
338
339         aese    v0.16b,v16.16b
340         aesmc   v0.16b,v0.16b
341         ld1     {v16.4s},[x14]
342         aese    v0.16b,v17.16b
343         aesmc   v0.16b,v0.16b
344         ld1     {v17.4s},[x3]
345         nop
346
347 .Lcbc_enc192:
348         aese    v0.16b,v16.16b
349         aesmc   v0.16b,v0.16b
350         subs    x2,x2,#16
351         aese    v0.16b,v17.16b
352         aesmc   v0.16b,v0.16b
353         csel    x8,xzr,x8,eq
354         aese    v0.16b,v18.16b
355         aesmc   v0.16b,v0.16b
356         aese    v0.16b,v19.16b
357         aesmc   v0.16b,v0.16b
358         ld1     {v16.16b},[x0],x8
359         aese    v0.16b,v20.16b
360         aesmc   v0.16b,v0.16b
361         eor     v16.16b,v16.16b,v5.16b
362         aese    v0.16b,v21.16b
363         aesmc   v0.16b,v0.16b
364         ld1     {v17.4s},[x7]           // re-pre-load rndkey[1]
365         aese    v0.16b,v22.16b
366         aesmc   v0.16b,v0.16b
367         aese    v0.16b,v23.16b
368         eor     v6.16b,v0.16b,v7.16b
369         b.hs    .Loop_cbc_enc
370
371         st1     {v6.16b},[x1],#16
372         b       .Lcbc_done
373
374 .align  5
375 .Lcbc_enc128:
376         ld1     {v2.4s,v3.4s},[x7]
377         aese    v0.16b,v16.16b
378         aesmc   v0.16b,v0.16b
379         b       .Lenter_cbc_enc128
380 .Loop_cbc_enc128:
381         aese    v0.16b,v16.16b
382         aesmc   v0.16b,v0.16b
383         st1     {v6.16b},[x1],#16
384 .Lenter_cbc_enc128:
385         aese    v0.16b,v17.16b
386         aesmc   v0.16b,v0.16b
387         subs    x2,x2,#16
388         aese    v0.16b,v2.16b
389         aesmc   v0.16b,v0.16b
390         csel    x8,xzr,x8,eq
391         aese    v0.16b,v3.16b
392         aesmc   v0.16b,v0.16b
393         aese    v0.16b,v18.16b
394         aesmc   v0.16b,v0.16b
395         aese    v0.16b,v19.16b
396         aesmc   v0.16b,v0.16b
397         ld1     {v16.16b},[x0],x8
398         aese    v0.16b,v20.16b
399         aesmc   v0.16b,v0.16b
400         aese    v0.16b,v21.16b
401         aesmc   v0.16b,v0.16b
402         aese    v0.16b,v22.16b
403         aesmc   v0.16b,v0.16b
404         eor     v16.16b,v16.16b,v5.16b
405         aese    v0.16b,v23.16b
406         eor     v6.16b,v0.16b,v7.16b
407         b.hs    .Loop_cbc_enc128
408
409         st1     {v6.16b},[x1],#16
410         b       .Lcbc_done
411 .align  5
412 .Lcbc_dec:
413         ld1     {v18.16b},[x0],#16
414         subs    x2,x2,#32               // bias
415         add     w6,w5,#2
416         orr     v3.16b,v0.16b,v0.16b
417         orr     v1.16b,v0.16b,v0.16b
418         orr     v19.16b,v18.16b,v18.16b
419         b.lo    .Lcbc_dec_tail
420
421         orr     v1.16b,v18.16b,v18.16b
422         ld1     {v18.16b},[x0],#16
423         orr     v2.16b,v0.16b,v0.16b
424         orr     v3.16b,v1.16b,v1.16b
425         orr     v19.16b,v18.16b,v18.16b
426
427 .Loop3x_cbc_dec:
428         aesd    v0.16b,v16.16b
429         aesimc  v0.16b,v0.16b
430         aesd    v1.16b,v16.16b
431         aesimc  v1.16b,v1.16b
432         aesd    v18.16b,v16.16b
433         aesimc  v18.16b,v18.16b
434         ld1     {v16.4s},[x7],#16
435         subs    w6,w6,#2
436         aesd    v0.16b,v17.16b
437         aesimc  v0.16b,v0.16b
438         aesd    v1.16b,v17.16b
439         aesimc  v1.16b,v1.16b
440         aesd    v18.16b,v17.16b
441         aesimc  v18.16b,v18.16b
442         ld1     {v17.4s},[x7],#16
443         b.gt    .Loop3x_cbc_dec
444
445         aesd    v0.16b,v16.16b
446         aesimc  v0.16b,v0.16b
447         aesd    v1.16b,v16.16b
448         aesimc  v1.16b,v1.16b
449         aesd    v18.16b,v16.16b
450         aesimc  v18.16b,v18.16b
451         eor     v4.16b,v6.16b,v7.16b
452         subs    x2,x2,#0x30
453         eor     v5.16b,v2.16b,v7.16b
454         csel    x6,x2,x6,lo                     // x6, w6, is zero at this point
455         aesd    v0.16b,v17.16b
456         aesimc  v0.16b,v0.16b
457         aesd    v1.16b,v17.16b
458         aesimc  v1.16b,v1.16b
459         aesd    v18.16b,v17.16b
460         aesimc  v18.16b,v18.16b
461         eor     v17.16b,v3.16b,v7.16b
462         add     x0,x0,x6                // x0 is adjusted in such way that
463                                         // at exit from the loop v1.16b-v18.16b
464                                         // are loaded with last "words"
465         orr     v6.16b,v19.16b,v19.16b
466         mov     x7,x3
467         aesd    v0.16b,v20.16b
468         aesimc  v0.16b,v0.16b
469         aesd    v1.16b,v20.16b
470         aesimc  v1.16b,v1.16b
471         aesd    v18.16b,v20.16b
472         aesimc  v18.16b,v18.16b
473         ld1     {v2.16b},[x0],#16
474         aesd    v0.16b,v21.16b
475         aesimc  v0.16b,v0.16b
476         aesd    v1.16b,v21.16b
477         aesimc  v1.16b,v1.16b
478         aesd    v18.16b,v21.16b
479         aesimc  v18.16b,v18.16b
480         ld1     {v3.16b},[x0],#16
481         aesd    v0.16b,v22.16b
482         aesimc  v0.16b,v0.16b
483         aesd    v1.16b,v22.16b
484         aesimc  v1.16b,v1.16b
485         aesd    v18.16b,v22.16b
486         aesimc  v18.16b,v18.16b
487         ld1     {v19.16b},[x0],#16
488         aesd    v0.16b,v23.16b
489         aesd    v1.16b,v23.16b
490         aesd    v18.16b,v23.16b
491         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
492         add     w6,w5,#2
493         eor     v4.16b,v4.16b,v0.16b
494         eor     v5.16b,v5.16b,v1.16b
495         eor     v18.16b,v18.16b,v17.16b
496         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
497         st1     {v4.16b},[x1],#16
498         orr     v0.16b,v2.16b,v2.16b
499         st1     {v5.16b},[x1],#16
500         orr     v1.16b,v3.16b,v3.16b
501         st1     {v18.16b},[x1],#16
502         orr     v18.16b,v19.16b,v19.16b
503         b.hs    .Loop3x_cbc_dec
504
505         cmn     x2,#0x30
506         b.eq    .Lcbc_done
507         nop
508
509 .Lcbc_dec_tail:
510         aesd    v1.16b,v16.16b
511         aesimc  v1.16b,v1.16b
512         aesd    v18.16b,v16.16b
513         aesimc  v18.16b,v18.16b
514         ld1     {v16.4s},[x7],#16
515         subs    w6,w6,#2
516         aesd    v1.16b,v17.16b
517         aesimc  v1.16b,v1.16b
518         aesd    v18.16b,v17.16b
519         aesimc  v18.16b,v18.16b
520         ld1     {v17.4s},[x7],#16
521         b.gt    .Lcbc_dec_tail
522
523         aesd    v1.16b,v16.16b
524         aesimc  v1.16b,v1.16b
525         aesd    v18.16b,v16.16b
526         aesimc  v18.16b,v18.16b
527         aesd    v1.16b,v17.16b
528         aesimc  v1.16b,v1.16b
529         aesd    v18.16b,v17.16b
530         aesimc  v18.16b,v18.16b
531         aesd    v1.16b,v20.16b
532         aesimc  v1.16b,v1.16b
533         aesd    v18.16b,v20.16b
534         aesimc  v18.16b,v18.16b
535         cmn     x2,#0x20
536         aesd    v1.16b,v21.16b
537         aesimc  v1.16b,v1.16b
538         aesd    v18.16b,v21.16b
539         aesimc  v18.16b,v18.16b
540         eor     v5.16b,v6.16b,v7.16b
541         aesd    v1.16b,v22.16b
542         aesimc  v1.16b,v1.16b
543         aesd    v18.16b,v22.16b
544         aesimc  v18.16b,v18.16b
545         eor     v17.16b,v3.16b,v7.16b
546         aesd    v1.16b,v23.16b
547         aesd    v18.16b,v23.16b
548         b.eq    .Lcbc_dec_one
549         eor     v5.16b,v5.16b,v1.16b
550         eor     v17.16b,v17.16b,v18.16b
551         orr     v6.16b,v19.16b,v19.16b
552         st1     {v5.16b},[x1],#16
553         st1     {v17.16b},[x1],#16
554         b       .Lcbc_done
555
556 .Lcbc_dec_one:
557         eor     v5.16b,v5.16b,v18.16b
558         orr     v6.16b,v19.16b,v19.16b
559         st1     {v5.16b},[x1],#16
560
561 .Lcbc_done:
562         st1     {v6.16b},[x4]
563 .Lcbc_abort:
564         ldr     x29,[sp],#16
565         ret
566 .size   aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
567 .globl  aes_v8_ctr32_encrypt_blocks
568 .type   aes_v8_ctr32_encrypt_blocks,%function
569 .align  5
570 aes_v8_ctr32_encrypt_blocks:
571         stp     x29,x30,[sp,#-16]!
572         add     x29,sp,#0
573         ldr     w5,[x3,#240]
574
575         ldr     w8, [x4, #12]
576         ld1     {v0.4s},[x4]
577
578         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
579         sub     w5,w5,#4
580         mov     x12,#16
581         cmp     x2,#2
582         add     x7,x3,x5,lsl#4  // pointer to last 5 round keys
583         sub     w5,w5,#2
584         ld1     {v20.4s,v21.4s},[x7],#32
585         ld1     {v22.4s,v23.4s},[x7],#32
586         ld1     {v7.4s},[x7]
587         add     x7,x3,#32
588         mov     w6,w5
589         csel    x12,xzr,x12,lo
590 #ifndef __ARMEB__
591         rev     w8, w8
592 #endif
593         orr     v1.16b,v0.16b,v0.16b
594         add     w10, w8, #1
595         orr     v18.16b,v0.16b,v0.16b
596         add     w8, w8, #2
597         orr     v6.16b,v0.16b,v0.16b
598         rev     w10, w10
599         mov     v1.s[3],w10
600         b.ls    .Lctr32_tail
601         rev     w12, w8
602         sub     x2,x2,#3                // bias
603         mov     v18.s[3],w12
604         b       .Loop3x_ctr32
605
606 .align  4
607 .Loop3x_ctr32:
608         aese    v0.16b,v16.16b
609         aesmc   v0.16b,v0.16b
610         aese    v1.16b,v16.16b
611         aesmc   v1.16b,v1.16b
612         aese    v18.16b,v16.16b
613         aesmc   v18.16b,v18.16b
614         ld1     {v16.4s},[x7],#16
615         subs    w6,w6,#2
616         aese    v0.16b,v17.16b
617         aesmc   v0.16b,v0.16b
618         aese    v1.16b,v17.16b
619         aesmc   v1.16b,v1.16b
620         aese    v18.16b,v17.16b
621         aesmc   v18.16b,v18.16b
622         ld1     {v17.4s},[x7],#16
623         b.gt    .Loop3x_ctr32
624
625         aese    v0.16b,v16.16b
626         aesmc   v4.16b,v0.16b
627         aese    v1.16b,v16.16b
628         aesmc   v5.16b,v1.16b
629         ld1     {v2.16b},[x0],#16
630         orr     v0.16b,v6.16b,v6.16b
631         aese    v18.16b,v16.16b
632         aesmc   v18.16b,v18.16b
633         ld1     {v3.16b},[x0],#16
634         orr     v1.16b,v6.16b,v6.16b
635         aese    v4.16b,v17.16b
636         aesmc   v4.16b,v4.16b
637         aese    v5.16b,v17.16b
638         aesmc   v5.16b,v5.16b
639         ld1     {v19.16b},[x0],#16
640         mov     x7,x3
641         aese    v18.16b,v17.16b
642         aesmc   v17.16b,v18.16b
643         orr     v18.16b,v6.16b,v6.16b
644         add     w9,w8,#1
645         aese    v4.16b,v20.16b
646         aesmc   v4.16b,v4.16b
647         aese    v5.16b,v20.16b
648         aesmc   v5.16b,v5.16b
649         eor     v2.16b,v2.16b,v7.16b
650         add     w10,w8,#2
651         aese    v17.16b,v20.16b
652         aesmc   v17.16b,v17.16b
653         eor     v3.16b,v3.16b,v7.16b
654         add     w8,w8,#3
655         aese    v4.16b,v21.16b
656         aesmc   v4.16b,v4.16b
657         aese    v5.16b,v21.16b
658         aesmc   v5.16b,v5.16b
659         eor     v19.16b,v19.16b,v7.16b
660         rev     w9,w9
661         aese    v17.16b,v21.16b
662         aesmc   v17.16b,v17.16b
663         mov     v0.s[3], w9
664         rev     w10,w10
665         aese    v4.16b,v22.16b
666         aesmc   v4.16b,v4.16b
667         aese    v5.16b,v22.16b
668         aesmc   v5.16b,v5.16b
669         mov     v1.s[3], w10
670         rev     w12,w8
671         aese    v17.16b,v22.16b
672         aesmc   v17.16b,v17.16b
673         mov     v18.s[3], w12
674         subs    x2,x2,#3
675         aese    v4.16b,v23.16b
676         aese    v5.16b,v23.16b
677         aese    v17.16b,v23.16b
678
679         eor     v2.16b,v2.16b,v4.16b
680         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
681         st1     {v2.16b},[x1],#16
682         eor     v3.16b,v3.16b,v5.16b
683         mov     w6,w5
684         st1     {v3.16b},[x1],#16
685         eor     v19.16b,v19.16b,v17.16b
686         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
687         st1     {v19.16b},[x1],#16
688         b.hs    .Loop3x_ctr32
689
690         adds    x2,x2,#3
691         b.eq    .Lctr32_done
692         cmp     x2,#1
693         mov     x12,#16
694         csel    x12,xzr,x12,eq
695
696 .Lctr32_tail:
697         aese    v0.16b,v16.16b
698         aesmc   v0.16b,v0.16b
699         aese    v1.16b,v16.16b
700         aesmc   v1.16b,v1.16b
701         ld1     {v16.4s},[x7],#16
702         subs    w6,w6,#2
703         aese    v0.16b,v17.16b
704         aesmc   v0.16b,v0.16b
705         aese    v1.16b,v17.16b
706         aesmc   v1.16b,v1.16b
707         ld1     {v17.4s},[x7],#16
708         b.gt    .Lctr32_tail
709
710         aese    v0.16b,v16.16b
711         aesmc   v0.16b,v0.16b
712         aese    v1.16b,v16.16b
713         aesmc   v1.16b,v1.16b
714         aese    v0.16b,v17.16b
715         aesmc   v0.16b,v0.16b
716         aese    v1.16b,v17.16b
717         aesmc   v1.16b,v1.16b
718         ld1     {v2.16b},[x0],x12
719         aese    v0.16b,v20.16b
720         aesmc   v0.16b,v0.16b
721         aese    v1.16b,v20.16b
722         aesmc   v1.16b,v1.16b
723         ld1     {v3.16b},[x0]
724         aese    v0.16b,v21.16b
725         aesmc   v0.16b,v0.16b
726         aese    v1.16b,v21.16b
727         aesmc   v1.16b,v1.16b
728         eor     v2.16b,v2.16b,v7.16b
729         aese    v0.16b,v22.16b
730         aesmc   v0.16b,v0.16b
731         aese    v1.16b,v22.16b
732         aesmc   v1.16b,v1.16b
733         eor     v3.16b,v3.16b,v7.16b
734         aese    v0.16b,v23.16b
735         aese    v1.16b,v23.16b
736
737         cmp     x2,#1
738         eor     v2.16b,v2.16b,v0.16b
739         eor     v3.16b,v3.16b,v1.16b
740         st1     {v2.16b},[x1],#16
741         b.eq    .Lctr32_done
742         st1     {v3.16b},[x1]
743
744 .Lctr32_done:
745         ldr     x29,[sp],#16
746         ret
747 .size   aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
748 #endif