]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/crypto/openssl/aarch64/aesv8-armx.S
Use a template assembly file to generate the embedded MFS.
[FreeBSD/FreeBSD.git] / sys / crypto / openssl / aarch64 / aesv8-armx.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3 #include "arm_arch.h"
4
5 #if __ARM_MAX_ARCH__>=7
6 .text
7 .align  5
// Constant table read by the key-expansion code in aes_v8_set_encrypt_key.
// Three rows of four 32-bit words (48 bytes in total):
//   row 0: 0x01 in every lane - the first round constant; the expansion
//          loops shift it left by one bit after each use.
//   row 1: byte-index mask for tbl. Stored little-endian, each word selects
//          source bytes 13,14,15,12, i.e. the last 32-bit word rotated by one
//          byte, copied into all four lanes.
//   row 2: 0x1b in every lane - loaded by the AES-128 path after its eight
//          loop iterations, for the last two round constants (0x1b, 0x36).
8 .Lrcon:
9 .long   0x01,0x01,0x01,0x01
10 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
11 .long   0x1b,0x1b,0x1b,0x1b
12
// aes_v8_set_encrypt_key: expand a user key into the round-key schedule
// consumed by the encryption routines in this file.
//
// In:   x0 = address of the key bytes (16, 24 or 32 bytes are read)
//       w1 = key length in bits; only 128, 192 and 256 pass the checks
//       x2 = address of the output area: round keys are written from
//            offset 0 and the 32-bit round count (10, 12 or 14) at offset 240
// Out:  x0 = 0 on success, -1 if x0 or x2 is zero, -2 if w1 is rejected
// Uses: x1, x3, w12, v0-v6 and the flags; x0 and x2 are advanced.
//       On the success path x2 == original x2 + 240 and w12 == round count;
//       aes_v8_set_decrypt_key enters at .Lenc_key and relies on both.
// NOTE(review): presumably called from C as
//       int aes_v8_set_encrypt_key(const unsigned char *key, int bits, AES_KEY *out)
//       - confirm against the C-side prototype.
13 .globl  aes_v8_set_encrypt_key
14 .type   aes_v8_set_encrypt_key,%function
15 .align  5
16 aes_v8_set_encrypt_key:
17 .Lenc_key:
18         stp     x29,x30,[sp,#-16]!
19         add     x29,sp,#0
// Argument validation. x3 carries the would-be return code to the abort label.
20         mov     x3,#-1
21         cmp     x0,#0
22         b.eq    .Lenc_key_abort
23         cmp     x2,#0
24         b.eq    .Lenc_key_abort
25         mov     x3,#-2
26         cmp     w1,#128
27         b.lt    .Lenc_key_abort
28         cmp     w1,#256
29         b.gt    .Lenc_key_abort
// Reject anything that is not a multiple of 64 bits.
30         tst     w1,#0x3f
31         b.ne    .Lenc_key_abort
32
// The flags of this compare are consumed by the three branches below; none of
// the instructions in between modifies them.
33         adr     x3,.Lrcon
34         cmp     w1,#192
35
// v0 = 0 (zero key for aese and zero fill for ext), v3 = first 16 key bytes,
// v1 = round constant row, v2 = tbl mask row; x3 is left pointing at the
// 0x1b row of .Lrcon.
36         eor     v0.16b,v0.16b,v0.16b
37         ld1     {v3.16b},[x0],#16
38         mov     w1,#8           // reuse w1
39         ld1     {v1.4s,v2.4s},[x3],#32
40
41         b.lt    .Loop128
42         b.eq    .L192
43         b       .L256
44
// ---- 128-bit key: eight looped steps plus two unrolled ones ----
// Each step stores the current round key (v3) and derives the next one:
//   tbl        v6 = last word of v3, rotated by one byte, in every lane
//   aese       with the all-zero key v0: substitutes the bytes of v6
//   ext/eor x3 v3[i] ^= v3[i-1] ^ ... ^ v3[0]  (ext shifts in zero words)
//   eor        v6 ^= round constant, then v3 ^= v6
//   shl        doubles the round constant for the next step
45 .align  4
46 .Loop128:
47         tbl     v6.16b,{v3.16b},v2.16b
48         ext     v5.16b,v0.16b,v3.16b,#12
49         st1     {v3.4s},[x2],#16
50         aese    v6.16b,v0.16b
51         subs    w1,w1,#1
52
53         eor     v3.16b,v3.16b,v5.16b
54         ext     v5.16b,v0.16b,v5.16b,#12
55         eor     v3.16b,v3.16b,v5.16b
56         ext     v5.16b,v0.16b,v5.16b,#12
57         eor     v6.16b,v6.16b,v1.16b
58         eor     v3.16b,v3.16b,v5.16b
59         shl     v1.16b,v1.16b,#1
60         eor     v3.16b,v3.16b,v6.16b
61         b.ne    .Loop128
62
// Eight byte-wise doublings have exhausted the 0x01 constant; switch to the
// 0x1b row for the ninth step (the shl below turns it into 0x36 for the tenth).
63         ld1     {v1.4s},[x3]
64
65         tbl     v6.16b,{v3.16b},v2.16b
66         ext     v5.16b,v0.16b,v3.16b,#12
67         st1     {v3.4s},[x2],#16
68         aese    v6.16b,v0.16b
69
70         eor     v3.16b,v3.16b,v5.16b
71         ext     v5.16b,v0.16b,v5.16b,#12
72         eor     v3.16b,v3.16b,v5.16b
73         ext     v5.16b,v0.16b,v5.16b,#12
74         eor     v6.16b,v6.16b,v1.16b
75         eor     v3.16b,v3.16b,v5.16b
76         shl     v1.16b,v1.16b,#1
77         eor     v3.16b,v3.16b,v6.16b
78
79         tbl     v6.16b,{v3.16b},v2.16b
80         ext     v5.16b,v0.16b,v3.16b,#12
81         st1     {v3.4s},[x2],#16
82         aese    v6.16b,v0.16b
83
84         eor     v3.16b,v3.16b,v5.16b
85         ext     v5.16b,v0.16b,v5.16b,#12
86         eor     v3.16b,v3.16b,v5.16b
87         ext     v5.16b,v0.16b,v5.16b,#12
88         eor     v6.16b,v6.16b,v1.16b
89         eor     v3.16b,v3.16b,v5.16b
90         eor     v3.16b,v3.16b,v6.16b
// Final (eleventh) round key lands at offset 160 without post-increment;
// adding 0x50 then moves x2 to offset 240 where .Ldone stores the round count.
91         st1     {v3.4s},[x2]
92         add     x2,x2,#0x50
93
94         mov     w12,#10
95         b       .Ldone
96
// ---- 192-bit key: eight steps, each emitting 24 bytes of schedule ----
// v3 holds key words 0-3, the low half of v4 holds words 4-5. Subtracting 8
// from every mask byte makes tbl pick bytes 5,6,7,4 of v4, i.e. word 5
// rotated by one byte.
97 .align  4
98 .L192:
99         ld1     {v4.8b},[x0],#8
100         movi    v6.16b,#8                       // borrow v6.16b
101         st1     {v3.4s},[x2],#16
102         sub     v2.16b,v2.16b,v6.16b    // adjust the mask
103
104 .Loop192:
105         tbl     v6.16b,{v4.16b},v2.16b
106         ext     v5.16b,v0.16b,v3.16b,#12
107         st1     {v4.8b},[x2],#8
108         aese    v6.16b,v0.16b
109         subs    w1,w1,#1
110
111         eor     v3.16b,v3.16b,v5.16b
112         ext     v5.16b,v0.16b,v5.16b,#12
113         eor     v3.16b,v3.16b,v5.16b
114         ext     v5.16b,v0.16b,v5.16b,#12
115         eor     v3.16b,v3.16b,v5.16b
116
// Continue the same running XOR into the two extra words kept in v4: splat
// the last word of v3 (before v6 is folded in), combine it with v4 and with
// v4 shifted up by one word, then fold v6 into both v3 and v4.
117         dup     v5.4s,v3.s[3]
118         eor     v5.16b,v5.16b,v4.16b
119         eor     v6.16b,v6.16b,v1.16b
120         ext     v4.16b,v0.16b,v4.16b,#12
121         shl     v1.16b,v1.16b,#1
122         eor     v4.16b,v4.16b,v5.16b
123         eor     v3.16b,v3.16b,v6.16b
124         eor     v4.16b,v4.16b,v6.16b
125         st1     {v3.4s},[x2],#16
126         b.ne    .Loop192
127
// 16 + 8*24 = 208 bytes have been written; skip 0x20 more to reach offset 240.
128         mov     w12,#12
129         add     x2,x2,#0x20
130         b       .Ldone
131
// ---- 256-bit key: v3 = key words 0-3, v4 = key words 4-7 ----
// Seven iterations. The first half of each iteration updates v3 using the
// rotated, substituted last word of v4 plus the round constant; the second
// half updates v4 using the substituted (not rotated, no constant) last word
// of the new v3. The seventh iteration leaves after its first half.
132 .align  4
133 .L256:
134         ld1     {v4.16b},[x0]
135         mov     w1,#7
136         mov     w12,#14
137         st1     {v3.4s},[x2],#16
138
139 .Loop256:
140         tbl     v6.16b,{v4.16b},v2.16b
141         ext     v5.16b,v0.16b,v3.16b,#12
142         st1     {v4.4s},[x2],#16
143         aese    v6.16b,v0.16b
144         subs    w1,w1,#1
145
146         eor     v3.16b,v3.16b,v5.16b
147         ext     v5.16b,v0.16b,v5.16b,#12
148         eor     v3.16b,v3.16b,v5.16b
149         ext     v5.16b,v0.16b,v5.16b,#12
150         eor     v6.16b,v6.16b,v1.16b
151         eor     v3.16b,v3.16b,v5.16b
152         shl     v1.16b,v1.16b,#1
153         eor     v3.16b,v3.16b,v6.16b
154         st1     {v3.4s},[x2],#16
// Flags still come from the subs above; on the seventh pass x2 is already at
// offset 240 (16 + 7*32).
155         b.eq    .Ldone
156
157         dup     v6.4s,v3.s[3]           // just splat
158         ext     v5.16b,v0.16b,v4.16b,#12
159         aese    v6.16b,v0.16b
160
161         eor     v4.16b,v4.16b,v5.16b
162         ext     v5.16b,v0.16b,v5.16b,#12
163         eor     v4.16b,v4.16b,v5.16b
164         ext     v5.16b,v0.16b,v5.16b,#12
165         eor     v4.16b,v4.16b,v5.16b
166
167         eor     v4.16b,v4.16b,v6.16b
168         b       .Loop256
169
// Common success exit: x2 is at offset 240 on every path; record the round
// count there and select return code 0.
170 .Ldone:
171         str     w12,[x2]
172         mov     x3,#0
173
// Only x29 is reloaded: x30 is never written in this routine (it makes no
// calls), so ret still uses the caller's return address.
174 .Lenc_key_abort:
175         mov     x0,x3                   // return value
176         ldr     x29,[sp],#16
177         ret
178 .size   aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
179
// aes_v8_set_decrypt_key: build the key schedule used by the decryption
// routines in this file.
//
// In:   x0, w1, x2 - passed unchanged to the encryption key setup at .Lenc_key
//       (key bytes, key length in bits, output area).
// Out:  x0 = 0 on success, otherwise the non-zero code returned by the
//       encryption key setup (in which case the schedule is not converted).
// Uses: x0, x2, x4, x12, v0, v1, the flags, plus everything .Lenc_key uses.
//
// After the encryption schedule has been generated it is converted in place:
// the order of the round keys is reversed, and aesimc is applied to every
// round key except the first and the last.
// NOTE(review): presumably called from C as
//       int aes_v8_set_decrypt_key(const unsigned char *key, int bits, AES_KEY *out)
//       - confirm against the C-side prototype.
180 .globl  aes_v8_set_decrypt_key
181 .type   aes_v8_set_decrypt_key,%function
182 .align  5
183 aes_v8_set_decrypt_key:
184 .inst   0xd503233f              // paciasp
185         stp     x29,x30,[sp,#-16]!
186         add     x29,sp,#0
187         bl      .Lenc_key
188
189         cmp     x0,#0
190         b.ne    .Ldec_key_abort
191
// .Lenc_key returns with x2 240 bytes past the start of the schedule and the
// round count in w12. x0 = address of the last round key, x4 = -16 is the
// post-index step used to walk x0 downwards.
192         sub     x2,x2,#240              // restore original x2
193         mov     x4,#-16
194         add     x0,x2,x12,lsl#4 // end of key schedule
195
// Swap the first and last round keys without transforming them.
196         ld1     {v0.4s},[x2]
197         ld1     {v1.4s},[x0]
198         st1     {v0.4s},[x0],x4
199         st1     {v1.4s},[x2],#16
200
// Walk inwards from both ends, swapping each pair of round keys and applying
// aesimc to both, until the two pointers meet.
201 .Loop_imc:
202         ld1     {v0.4s},[x2]
203         ld1     {v1.4s},[x0]
204         aesimc  v0.16b,v0.16b
205         aesimc  v1.16b,v1.16b
206         st1     {v0.4s},[x0],x4
207         st1     {v1.4s},[x2],#16
208         cmp     x0,x2
209         b.hi    .Loop_imc
210
// The loop leaves once x0 is no longer above x2; the round key at that
// position (the middle one) is transformed in place.
211         ld1     {v0.4s},[x2]
212         aesimc  v0.16b,v0.16b
213         st1     {v0.4s},[x0]
214
215         eor     x0,x0,x0                // return value
216 .Ldec_key_abort:
217         ldp     x29,x30,[sp],#16
218 .inst   0xd50323bf              // autiasp
219         ret
220 .size   aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = address of the 16 input bytes
//       x1 = address receiving the 16 output bytes
//       x2 = key schedule: round keys from offset 0, 32-bit round count at
//            offset 240
// Out:  nothing in registers; 16 bytes stored at [x1].
// Uses: x2 (advanced), w3, v0-v2 and the flags. No stack frame is set up.
// NOTE(review): presumably called from C as
//       void aes_v8_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
//       - confirm against the C-side prototype.
221 .globl  aes_v8_encrypt
222 .type   aes_v8_encrypt,%function
223 .align  5
224 aes_v8_encrypt:
// w3 = round count - 2 is the loop counter; v0/v1 start as round keys 0 and 1
// and are refilled alternately, v2 holds the block being transformed.
225         ldr     w3,[x2,#240]
226         ld1     {v0.4s},[x2],#16
227         ld1     {v2.16b},[x0]
228         sub     w3,w3,#2
229         ld1     {v1.4s},[x2],#16
230
// Two rounds per iteration; the next key is loaded while the current one is
// being applied.
231 .Loop_enc:
232         aese    v2.16b,v0.16b
233         aesmc   v2.16b,v2.16b
234         ld1     {v0.4s},[x2],#16
235         subs    w3,w3,#2
236         aese    v2.16b,v1.16b
237         aesmc   v2.16b,v2.16b
238         ld1     {v1.4s},[x2],#16
239         b.gt    .Loop_enc
240
// Last two rounds: the final aese is not followed by aesmc, and the last
// round key (loaded into v0 here) is applied with a plain eor.
241         aese    v2.16b,v0.16b
242         aesmc   v2.16b,v2.16b
243         ld1     {v0.4s},[x2]
244         aese    v2.16b,v1.16b
245         eor     v2.16b,v2.16b,v0.16b
246
247         st1     {v2.16b},[x1]
248         ret
249 .size   aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = address of the 16 input bytes
//       x1 = address receiving the 16 output bytes
//       x2 = key schedule: round keys from offset 0, 32-bit round count at
//            offset 240 (presumably one prepared by aes_v8_set_decrypt_key -
//            confirm against callers)
// Out:  nothing in registers; 16 bytes stored at [x1].
// Uses: x2 (advanced), w3, v0-v2 and the flags. No stack frame is set up.
// Same structure as aes_v8_encrypt, with aesd/aesimc in place of aese/aesmc.
250 .globl  aes_v8_decrypt
251 .type   aes_v8_decrypt,%function
252 .align  5
253 aes_v8_decrypt:
// w3 = round count - 2 is the loop counter; v0/v1 start as round keys 0 and 1
// and are refilled alternately, v2 holds the block being transformed.
254         ldr     w3,[x2,#240]
255         ld1     {v0.4s},[x2],#16
256         ld1     {v2.16b},[x0]
257         sub     w3,w3,#2
258         ld1     {v1.4s},[x2],#16
259
// Two rounds per iteration; the next key is loaded while the current one is
// being applied.
260 .Loop_dec:
261         aesd    v2.16b,v0.16b
262         aesimc  v2.16b,v2.16b
263         ld1     {v0.4s},[x2],#16
264         subs    w3,w3,#2
265         aesd    v2.16b,v1.16b
266         aesimc  v2.16b,v2.16b
267         ld1     {v1.4s},[x2],#16
268         b.gt    .Loop_dec
269
// Last two rounds: the final aesd is not followed by aesimc, and the last
// round key (loaded into v0 here) is applied with a plain eor.
270         aesd    v2.16b,v0.16b
271         aesimc  v2.16b,v2.16b
272         ld1     {v0.4s},[x2]
273         aesd    v2.16b,v1.16b
274         eor     v2.16b,v2.16b,v0.16b
275
276         st1     {v2.16b},[x1]
277         ret
278 .size   aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole 16-byte
// blocks.
//
// In:   x0 = input address
//       x1 = output address
//       x2 = length in bytes; rounded down to a multiple of 16. If it is
//            below 16 the routine returns without reading or writing any of
//            the buffers (the chaining value is not stored back either).
//       x3 = key schedule: round keys from offset 0, 32-bit round count at
//            offset 240
//       x4 = address of the 16-byte chaining value (IV); read on entry and
//            overwritten at .Lcbc_done with the last ciphertext block
//       w5 = 0 selects decryption, any other value encryption
// Out:  nothing in registers.
// Uses: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23 and the flags.
//
// Register roles common to both directions:
//   v6        chaining value (IV, then the previous ciphertext block)
//   v7        last round key
//   v16, v17  round keys 0 and 1 on entry, then a rotating pair
//   v20-v23   the four round keys before the last one
//   v18, v19  two more round keys when encrypting; data when decrypting
// NOTE(review): presumably called from C as
//       void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                               size_t length, const AES_KEY *key,
//                               unsigned char *ivec, const int enc)
//       - confirm against the C-side prototype.
279 .globl  aes_v8_cbc_encrypt
280 .type   aes_v8_cbc_encrypt,%function
281 .align  5
282 aes_v8_cbc_encrypt:
283         stp     x29,x30,[sp,#-16]!
284         add     x29,sp,#0
// x2 is kept biased by -16 from here on. x8 is the post-increment applied to
// the input pointer after each block load: 16 normally, 0 once the block just
// loaded is the final one, so later loads re-read it instead of running past
// the end of the input.
285         subs    x2,x2,#16
286         mov     x8,#16
287         b.lo    .Lcbc_abort
288         csel    x8,xzr,x8,eq
289
// The flags from this compare are consumed by "b.eq .Lcbc_dec" further down;
// nothing in between modifies them. w5 is then reused for the round count.
290         cmp     w5,#0                   // en- or decrypting?
291         ldr     w5,[x3,#240]
292         and     x2,x2,#-16
293         ld1     {v6.16b},[x4]
294         ld1     {v0.16b},[x0],x8
295
296         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
297         sub     w5,w5,#6
298         add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
299         sub     w5,w5,#2
300         ld1     {v18.4s,v19.4s},[x7],#32
301         ld1     {v20.4s,v21.4s},[x7],#32
302         ld1     {v22.4s,v23.4s},[x7],#32
303         ld1     {v7.4s},[x7]
304
// From here w5 = round count - 8 (2, 4 or 6 for 10, 12 or 14 rounds) and
// x7 points at round key 2.
305         add     x7,x3,#32
306         mov     w6,w5
307         b.eq    .Lcbc_dec
308
// ---- Encryption ----
// v0 = first block ^ IV. v5 = round key 0 ^ last round key: every following
// input block is pre-XORed with v5 and parked in v16, so that the first aese
// of the next block (which XORs v16 into v0, the previous block's state
// before its last round key was applied) performs the CBC chaining and the
// round-0 key addition in a single step.
309         cmp     w5,#2
310         eor     v0.16b,v0.16b,v6.16b
311         eor     v5.16b,v16.16b,v7.16b
312         b.eq    .Lcbc_enc128
313
// 12- and 14-round setup: v2/v3 = round keys 2 and 3; x7, x6, x12, x14 and x3
// become pointers to round keys 1, 4, 5, 6 and 7, which are reloaded into
// v16/v17 inside the loop.
314         ld1     {v2.4s,v3.4s},[x7]
315         add     x7,x3,#16
316         add     x6,x3,#16*4
317         add     x12,x3,#16*5
318         aese    v0.16b,v16.16b
319         aesmc   v0.16b,v0.16b
320         add     x14,x3,#16*6
321         add     x3,x3,#16*7
322         b       .Lenter_cbc_enc
323
324 .align  4
325 .Loop_cbc_enc:
// v16 = next input block ^ v5 (see above); v6 = ciphertext of the previous
// block, stored while the new block is already in flight.
326         aese    v0.16b,v16.16b
327         aesmc   v0.16b,v0.16b
328         st1     {v6.16b},[x1],#16
329 .Lenter_cbc_enc:
330         aese    v0.16b,v17.16b
331         aesmc   v0.16b,v0.16b
332         aese    v0.16b,v2.16b
333         aesmc   v0.16b,v0.16b
334         ld1     {v16.4s},[x6]
// w5 == 4 means 12 rounds: skip the two extra rounds of the 14-round case.
335         cmp     w5,#4
336         aese    v0.16b,v3.16b
337         aesmc   v0.16b,v0.16b
338         ld1     {v17.4s},[x12]
339         b.eq    .Lcbc_enc192
340
341         aese    v0.16b,v16.16b
342         aesmc   v0.16b,v0.16b
343         ld1     {v16.4s},[x14]
344         aese    v0.16b,v17.16b
345         aesmc   v0.16b,v0.16b
346         ld1     {v17.4s},[x3]
347         nop
348
349 .Lcbc_enc192:
350         aese    v0.16b,v16.16b
351         aesmc   v0.16b,v0.16b
// Loop bookkeeping is interleaved with the rounds; the flags set here are
// consumed by the csel below and by the b.hs that closes the loop.
352         subs    x2,x2,#16
353         aese    v0.16b,v17.16b
354         aesmc   v0.16b,v0.16b
355         csel    x8,xzr,x8,eq
356         aese    v0.16b,v18.16b
357         aesmc   v0.16b,v0.16b
358         aese    v0.16b,v19.16b
359         aesmc   v0.16b,v0.16b
360         ld1     {v16.16b},[x0],x8
361         aese    v0.16b,v20.16b
362         aesmc   v0.16b,v0.16b
363         eor     v16.16b,v16.16b,v5.16b
364         aese    v0.16b,v21.16b
365         aesmc   v0.16b,v0.16b
366         ld1     {v17.4s},[x7]           // re-pre-load rndkey[1]
367         aese    v0.16b,v22.16b
368         aesmc   v0.16b,v0.16b
369         aese    v0.16b,v23.16b
// v6 = finished ciphertext block; v0 deliberately stays without the last
// round key applied (v5 already contains it for the next block).
370         eor     v6.16b,v0.16b,v7.16b
371         b.hs    .Loop_cbc_enc
372
373         st1     {v6.16b},[x1],#16
374         b       .Lcbc_done
375
// 10-round variant of the loop above: all round keys stay in registers
// (v16/v17, v2/v3, v18-v23, v7), so no keys are reloaded inside the loop.
376 .align  5
377 .Lcbc_enc128:
378         ld1     {v2.4s,v3.4s},[x7]
379         aese    v0.16b,v16.16b
380         aesmc   v0.16b,v0.16b
381         b       .Lenter_cbc_enc128
382 .Loop_cbc_enc128:
383         aese    v0.16b,v16.16b
384         aesmc   v0.16b,v0.16b
385         st1     {v6.16b},[x1],#16
386 .Lenter_cbc_enc128:
387         aese    v0.16b,v17.16b
388         aesmc   v0.16b,v0.16b
389         subs    x2,x2,#16
390         aese    v0.16b,v2.16b
391         aesmc   v0.16b,v0.16b
392         csel    x8,xzr,x8,eq
393         aese    v0.16b,v3.16b
394         aesmc   v0.16b,v0.16b
395         aese    v0.16b,v18.16b
396         aesmc   v0.16b,v0.16b
397         aese    v0.16b,v19.16b
398         aesmc   v0.16b,v0.16b
399         ld1     {v16.16b},[x0],x8
400         aese    v0.16b,v20.16b
401         aesmc   v0.16b,v0.16b
402         aese    v0.16b,v21.16b
403         aesmc   v0.16b,v0.16b
404         aese    v0.16b,v22.16b
405         aesmc   v0.16b,v0.16b
406         eor     v16.16b,v16.16b,v5.16b
407         aese    v0.16b,v23.16b
408         eor     v6.16b,v0.16b,v7.16b
409         b.hs    .Loop_cbc_enc128
410
411         st1     {v6.16b},[x1],#16
412         b       .Lcbc_done
// ---- Decryption: three blocks per iteration ----
// v0, v1, v18 carry the blocks being decrypted; v2, v3, v19 keep untouched
// copies of the same ciphertext blocks for the chaining XOR. v18/v19 are
// reused as data here, so only v20-v23 and v7 stay resident as round keys;
// the earlier ones are streamed through v16/v17 with w6 = round count - 6 as
// the counter.
413 .align  5
414 .Lcbc_dec:
415         ld1     {v18.16b},[x0],#16
416         subs    x2,x2,#32               // bias
417         add     w6,w5,#2
418         orr     v3.16b,v0.16b,v0.16b
419         orr     v1.16b,v0.16b,v0.16b
420         orr     v19.16b,v18.16b,v18.16b
// Fewer than three blocks in total: go straight to the one/two-block tail.
421         b.lo    .Lcbc_dec_tail
422
423         orr     v1.16b,v18.16b,v18.16b
424         ld1     {v18.16b},[x0],#16
425         orr     v2.16b,v0.16b,v0.16b
426         orr     v3.16b,v1.16b,v1.16b
427         orr     v19.16b,v18.16b,v18.16b
428
429 .Loop3x_cbc_dec:
430         aesd    v0.16b,v16.16b
431         aesimc  v0.16b,v0.16b
432         aesd    v1.16b,v16.16b
433         aesimc  v1.16b,v1.16b
434         aesd    v18.16b,v16.16b
435         aesimc  v18.16b,v18.16b
436         ld1     {v16.4s},[x7],#16
437         subs    w6,w6,#2
438         aesd    v0.16b,v17.16b
439         aesimc  v0.16b,v0.16b
440         aesd    v1.16b,v17.16b
441         aesimc  v1.16b,v1.16b
442         aesd    v18.16b,v17.16b
443         aesimc  v18.16b,v18.16b
444         ld1     {v17.4s},[x7],#16
445         b.gt    .Loop3x_cbc_dec
446
447         aesd    v0.16b,v16.16b
448         aesimc  v0.16b,v0.16b
449         aesd    v1.16b,v16.16b
450         aesimc  v1.16b,v1.16b
451         aesd    v18.16b,v16.16b
452         aesimc  v18.16b,v18.16b
// v4, v5 (and v17 below) = chaining value for each of the three blocks,
// pre-XORed with the last round key, so one eor at the end yields plaintext.
453         eor     v4.16b,v6.16b,v7.16b
// The flags from this subs select the csel below and the b.hs that closes the
// loop; nothing in between modifies them.
454         subs    x2,x2,#0x30
455         eor     v5.16b,v2.16b,v7.16b
456         csel    x6,x2,x6,lo                     // x6, w6, is zero at this point
457         aesd    v0.16b,v17.16b
458         aesimc  v0.16b,v0.16b
459         aesd    v1.16b,v17.16b
460         aesimc  v1.16b,v1.16b
461         aesd    v18.16b,v17.16b
462         aesimc  v18.16b,v18.16b
// v17 has served as a round key for the last time in this pass and is
// recycled for the third block's chaining value.
463         eor     v17.16b,v3.16b,v7.16b
464         add     x0,x0,x6                // x0 is adjusted in such way that
465                                         // at exit from the loop v1.16b-v18.16b
466                                         // are loaded with last "words"
// The third ciphertext block of this pass becomes the next chaining value;
// x7 restarts at round key 0 for the re-pre-loads below.
467         orr     v6.16b,v19.16b,v19.16b
468         mov     x7,x3
469         aesd    v0.16b,v20.16b
470         aesimc  v0.16b,v0.16b
471         aesd    v1.16b,v20.16b
472         aesimc  v1.16b,v1.16b
473         aesd    v18.16b,v20.16b
474         aesimc  v18.16b,v18.16b
475         ld1     {v2.16b},[x0],#16
476         aesd    v0.16b,v21.16b
477         aesimc  v0.16b,v0.16b
478         aesd    v1.16b,v21.16b
479         aesimc  v1.16b,v1.16b
480         aesd    v18.16b,v21.16b
481         aesimc  v18.16b,v18.16b
482         ld1     {v3.16b},[x0],#16
483         aesd    v0.16b,v22.16b
484         aesimc  v0.16b,v0.16b
485         aesd    v1.16b,v22.16b
486         aesimc  v1.16b,v1.16b
487         aesd    v18.16b,v22.16b
488         aesimc  v18.16b,v18.16b
489         ld1     {v19.16b},[x0],#16
490         aesd    v0.16b,v23.16b
491         aesd    v1.16b,v23.16b
492         aesd    v18.16b,v23.16b
493         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
494         add     w6,w5,#2
495         eor     v4.16b,v4.16b,v0.16b
496         eor     v5.16b,v5.16b,v1.16b
497         eor     v18.16b,v18.16b,v17.16b
498         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
// Store the three plaintext blocks and move the freshly loaded ciphertext
// blocks into the working registers for the next pass.
499         st1     {v4.16b},[x1],#16
500         orr     v0.16b,v2.16b,v2.16b
501         st1     {v5.16b},[x1],#16
502         orr     v1.16b,v3.16b,v3.16b
503         st1     {v18.16b},[x1],#16
504         orr     v18.16b,v19.16b,v19.16b
505         b.hs    .Loop3x_cbc_dec
506
// x2 == -0x30 here means the passes above consumed the input exactly.
507         cmn     x2,#0x30
508         b.eq    .Lcbc_done
509         nop
510
// ---- Decryption tail: one or two remaining blocks ----
// v1 and v18 are both run through the cipher; with a single block left only
// v18's result is used (see .Lcbc_dec_one).
511 .Lcbc_dec_tail:
512         aesd    v1.16b,v16.16b
513         aesimc  v1.16b,v1.16b
514         aesd    v18.16b,v16.16b
515         aesimc  v18.16b,v18.16b
516         ld1     {v16.4s},[x7],#16
517         subs    w6,w6,#2
518         aesd    v1.16b,v17.16b
519         aesimc  v1.16b,v1.16b
520         aesd    v18.16b,v17.16b
521         aesimc  v18.16b,v18.16b
522         ld1     {v17.4s},[x7],#16
523         b.gt    .Lcbc_dec_tail
524
525         aesd    v1.16b,v16.16b
526         aesimc  v1.16b,v1.16b
527         aesd    v18.16b,v16.16b
528         aesimc  v18.16b,v18.16b
529         aesd    v1.16b,v17.16b
530         aesimc  v1.16b,v1.16b
531         aesd    v18.16b,v17.16b
532         aesimc  v18.16b,v18.16b
533         aesd    v1.16b,v20.16b
534         aesimc  v1.16b,v1.16b
535         aesd    v18.16b,v20.16b
536         aesimc  v18.16b,v18.16b
// x2 == -0x20 means exactly one block is left; the flags are consumed by
// "b.eq .Lcbc_dec_one" below.
537         cmn     x2,#0x20
538         aesd    v1.16b,v21.16b
539         aesimc  v1.16b,v1.16b
540         aesd    v18.16b,v21.16b
541         aesimc  v18.16b,v18.16b
542         eor     v5.16b,v6.16b,v7.16b
543         aesd    v1.16b,v22.16b
544         aesimc  v1.16b,v1.16b
545         aesd    v18.16b,v22.16b
546         aesimc  v18.16b,v18.16b
547         eor     v17.16b,v3.16b,v7.16b
548         aesd    v1.16b,v23.16b
549         aesd    v18.16b,v23.16b
550         b.eq    .Lcbc_dec_one
// Two blocks: v1 chains with the previous chaining value (v5), v18 with the
// ciphertext copy held in v3 (v17); v19 becomes the new chaining value.
551         eor     v5.16b,v5.16b,v1.16b
552         eor     v17.16b,v17.16b,v18.16b
553         orr     v6.16b,v19.16b,v19.16b
554         st1     {v5.16b},[x1],#16
555         st1     {v17.16b},[x1],#16
556         b       .Lcbc_done
557
558 .Lcbc_dec_one:
559         eor     v5.16b,v5.16b,v18.16b
560         orr     v6.16b,v19.16b,v19.16b
561         st1     {v5.16b},[x1],#16
562
// Write the updated chaining value back through x4. The abort label below is
// reached directly for inputs shorter than one block and skips this store.
563 .Lcbc_done:
564         st1     {v6.16b},[x4]
// Only x29 is reloaded: x30 is never written in this routine (it makes no
// calls), so ret still uses the caller's return address.
565 .Lcbc_abort:
566         ldr     x29,[sp],#16
567         ret
568 .size   aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
// aes_v8_ctr32_encrypt_blocks: counter-mode processing of whole 16-byte
// blocks. Only the last 32-bit word of the counter block is incremented.
//
// In:   x0 = input address
//       x1 = output address
//       x2 = number of 16-byte blocks (not bytes)
//       x3 = key schedule: round keys from offset 0, 32-bit round count at
//            offset 240
//       x4 = address of the 16-byte initial counter block; it is only read,
//            the advanced counter is not written back
// Out:  nothing in registers.
// Uses: x0-x2, x5-x10, x12, v0-v7, v16-v23 and the flags.
//
// Register roles:
//   v6        unmodified copy of the counter block, template for new ones
//   w8        counter word in host byte order (highest value handed out)
//   v7        last round key; v20-v23 the four round keys before it
//   v16, v17  round keys 0 and 1 on entry, then a rotating pair
//   v0, v1, v18  counter blocks being encrypted (three at a time)
// NOTE(review): with x2 == 0 the code falls into .Lctr32_tail, reads one
//       input block and stores two output blocks. Callers presumably never
//       pass a zero block count - confirm.
// NOTE(review): presumably called from C as
//       void aes_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
//                                        size_t blocks, const AES_KEY *key,
//                                        const unsigned char ivec[16])
//       - confirm against the C-side prototype.
569 .globl  aes_v8_ctr32_encrypt_blocks
570 .type   aes_v8_ctr32_encrypt_blocks,%function
571 .align  5
572 aes_v8_ctr32_encrypt_blocks:
573         stp     x29,x30,[sp,#-16]!
574         add     x29,sp,#0
575         ldr     w5,[x3,#240]
576
577         ldr     w8, [x4, #12]
578         ld1     {v0.4s},[x4]
579
580         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
581         sub     w5,w5,#4
582         mov     x12,#16
// The flags from this compare feed both the csel (x12) and the
// "b.ls .Lctr32_tail" further down; nothing in between modifies them.
583         cmp     x2,#2
584         add     x7,x3,x5,lsl#4  // pointer to last 5 round keys
585         sub     w5,w5,#2
586         ld1     {v20.4s,v21.4s},[x7],#32
587         ld1     {v22.4s,v23.4s},[x7],#32
588         ld1     {v7.4s},[x7]
// From here w5 = round count - 6 (loop counter seed) and x7 points at round
// key 2. x12 is the input stride used in the tail: 0 when fewer than two
// blocks are requested, so the second load there re-reads the same block.
589         add     x7,x3,#32
590         mov     w6,w5
591         csel    x12,xzr,x12,lo
592 #ifndef __ARMEB__
593         rev     w8, w8
594 #endif
// Build counter blocks +1 (v1) and +2 (v18) from copies of the original; the
// incremented word is byte-reversed again before insertion into lane 3.
595         orr     v1.16b,v0.16b,v0.16b
596         add     w10, w8, #1
597         orr     v18.16b,v0.16b,v0.16b
598         add     w8, w8, #2
599         orr     v6.16b,v0.16b,v0.16b
600         rev     w10, w10
601         mov     v1.s[3],w10
// Two blocks or fewer: skip the three-way loop.
602         b.ls    .Lctr32_tail
603         rev     w12, w8
604         sub     x2,x2,#3                // bias
605         mov     v18.s[3],w12
606         b       .Loop3x_ctr32
607
// ---- Main loop: three counter blocks per pass ----
608 .align  4
609 .Loop3x_ctr32:
610         aese    v0.16b,v16.16b
611         aesmc   v0.16b,v0.16b
612         aese    v1.16b,v16.16b
613         aesmc   v1.16b,v1.16b
614         aese    v18.16b,v16.16b
615         aesmc   v18.16b,v18.16b
616         ld1     {v16.4s},[x7],#16
617         subs    w6,w6,#2
618         aese    v0.16b,v17.16b
619         aesmc   v0.16b,v0.16b
620         aese    v1.16b,v17.16b
621         aesmc   v1.16b,v1.16b
622         aese    v18.16b,v17.16b
623         aesmc   v18.16b,v18.16b
624         ld1     {v17.4s},[x7],#16
625         b.gt    .Loop3x_ctr32
626
// Remaining rounds. The in-flight states migrate to v4, v5 and v17 so that
// v0, v1 and v18 can be refilled from the template v6 with the next three
// counter values while the current pass is still finishing. v2, v3, v19 take
// the input blocks, pre-XORed with the last round key (v7).
627         aese    v0.16b,v16.16b
628         aesmc   v4.16b,v0.16b
629         aese    v1.16b,v16.16b
630         aesmc   v5.16b,v1.16b
631         ld1     {v2.16b},[x0],#16
632         orr     v0.16b,v6.16b,v6.16b
633         aese    v18.16b,v16.16b
634         aesmc   v18.16b,v18.16b
635         ld1     {v3.16b},[x0],#16
636         orr     v1.16b,v6.16b,v6.16b
637         aese    v4.16b,v17.16b
638         aesmc   v4.16b,v4.16b
639         aese    v5.16b,v17.16b
640         aesmc   v5.16b,v5.16b
641         ld1     {v19.16b},[x0],#16
642         mov     x7,x3
// Last use of v17 as a round key in this pass; it then carries the third state.
643         aese    v18.16b,v17.16b
644         aesmc   v17.16b,v18.16b
645         orr     v18.16b,v6.16b,v6.16b
646         add     w9,w8,#1
647         aese    v4.16b,v20.16b
648         aesmc   v4.16b,v4.16b
649         aese    v5.16b,v20.16b
650         aesmc   v5.16b,v5.16b
651         eor     v2.16b,v2.16b,v7.16b
652         add     w10,w8,#2
653         aese    v17.16b,v20.16b
654         aesmc   v17.16b,v17.16b
655         eor     v3.16b,v3.16b,v7.16b
656         add     w8,w8,#3
657         aese    v4.16b,v21.16b
658         aesmc   v4.16b,v4.16b
659         aese    v5.16b,v21.16b
660         aesmc   v5.16b,v5.16b
661         eor     v19.16b,v19.16b,v7.16b
662         rev     w9,w9
663         aese    v17.16b,v21.16b
664         aesmc   v17.16b,v17.16b
665         mov     v0.s[3], w9
666         rev     w10,w10
667         aese    v4.16b,v22.16b
668         aesmc   v4.16b,v4.16b
669         aese    v5.16b,v22.16b
670         aesmc   v5.16b,v5.16b
671         mov     v1.s[3], w10
672         rev     w12,w8
673         aese    v17.16b,v22.16b
674         aesmc   v17.16b,v17.16b
675         mov     v18.s[3], w12
// The flags from this subs are consumed by the b.hs that closes the loop;
// nothing in between modifies them.
676         subs    x2,x2,#3
677         aese    v4.16b,v23.16b
678         aese    v5.16b,v23.16b
679         aese    v17.16b,v23.16b
680
681         eor     v2.16b,v2.16b,v4.16b
682         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
683         st1     {v2.16b},[x1],#16
684         eor     v3.16b,v3.16b,v5.16b
685         mov     w6,w5
686         st1     {v3.16b},[x1],#16
687         eor     v19.16b,v19.16b,v17.16b
688         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
689         st1     {v19.16b},[x1],#16
690         b.hs    .Loop3x_ctr32
691
// Undo the bias: x2 = blocks still to do (0, 1 or 2). With exactly one left
// the tail's input stride x12 becomes 0.
692         adds    x2,x2,#3
693         b.eq    .Lctr32_done
694         cmp     x2,#1
695         mov     x12,#16
696         csel    x12,xzr,x12,eq
697
// ---- Tail: one or two blocks ----
// Both v0 and v1 are always encrypted; the second result is stored only when
// two blocks remain.
698 .Lctr32_tail:
699         aese    v0.16b,v16.16b
700         aesmc   v0.16b,v0.16b
701         aese    v1.16b,v16.16b
702         aesmc   v1.16b,v1.16b
703         ld1     {v16.4s},[x7],#16
704         subs    w6,w6,#2
705         aese    v0.16b,v17.16b
706         aesmc   v0.16b,v0.16b
707         aese    v1.16b,v17.16b
708         aesmc   v1.16b,v1.16b
709         ld1     {v17.4s},[x7],#16
710         b.gt    .Lctr32_tail
711
712         aese    v0.16b,v16.16b
713         aesmc   v0.16b,v0.16b
714         aese    v1.16b,v16.16b
715         aesmc   v1.16b,v1.16b
716         aese    v0.16b,v17.16b
717         aesmc   v0.16b,v0.16b
718         aese    v1.16b,v17.16b
719         aesmc   v1.16b,v1.16b
// With x12 == 0 the second load below re-reads the same input block rather
// than the 16 bytes after it.
720         ld1     {v2.16b},[x0],x12
721         aese    v0.16b,v20.16b
722         aesmc   v0.16b,v0.16b
723         aese    v1.16b,v20.16b
724         aesmc   v1.16b,v1.16b
725         ld1     {v3.16b},[x0]
726         aese    v0.16b,v21.16b
727         aesmc   v0.16b,v0.16b
728         aese    v1.16b,v21.16b
729         aesmc   v1.16b,v1.16b
730         eor     v2.16b,v2.16b,v7.16b
731         aese    v0.16b,v22.16b
732         aesmc   v0.16b,v0.16b
733         aese    v1.16b,v22.16b
734         aesmc   v1.16b,v1.16b
735         eor     v3.16b,v3.16b,v7.16b
736         aese    v0.16b,v23.16b
737         aese    v1.16b,v23.16b
738
739         cmp     x2,#1
740         eor     v2.16b,v2.16b,v0.16b
741         eor     v3.16b,v3.16b,v1.16b
742         st1     {v2.16b},[x1],#16
743         b.eq    .Lctr32_done
744         st1     {v3.16b},[x1]
745
// Only x29 is reloaded: x30 is never written in this routine (it makes no
// calls), so ret still uses the caller's return address.
746 .Lctr32_done:
747         ldr     x29,[sp],#16
748         ret
749 .size   aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
750 #endif