]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/crypto/openssl/aarch64/aesv8-armx.S
Merge OpenSSL 1.1.1i.
[FreeBSD/FreeBSD.git] / sys / crypto / openssl / aarch64 / aesv8-armx.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3 #include "arm_arch.h"
4
5 #if __ARM_MAX_ARCH__>=7
6 .text
7 .align  5
// Constant table used by aes_v8_set_encrypt_key (addressed with adr, so it
// has to stay close to that code).
//   row 0: initial round constant, loaded into v1 and doubled per byte
//          (shl #1) after every use
//   row 1: tbl index mask, loaded into v2
//   row 2: round constant reloaded into v1 once the eight doublings of row 0
//          are used up (128-bit path only)
8 .Lrcon:
9 .long   0x01,0x01,0x01,0x01
10 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
11 .long   0x1b,0x1b,0x1b,0x1b
12
// aes_v8_set_encrypt_key: expand an AES key into an encryption key schedule.
//
// In:   x0 = key bytes (16, 24 or 32 bytes are read, depending on w1)
//       w1 = key length in bits; accepted values are 128, 192 and 256
//       x2 = output buffer; round keys are written from offset 0 and the
//            round count (10/12/14) is stored as a 32-bit word at offset 240
// Out:  x0 = 0 on success, -1 if x0 or x2 is NULL, -2 if w1 is rejected
// Exit: on success x2 is left at entry+240 and w12 holds the round count;
//       aes_v8_set_decrypt_key enters through .Lenc_key and relies on both.
// Uses: x1, x3, w12, v0-v6, flags.
13 .globl  aes_v8_set_encrypt_key
14 .type   aes_v8_set_encrypt_key,%function
15 .align  5
16 aes_v8_set_encrypt_key:
17 .Lenc_key:
        // Frame record. x30 is never overwritten in this routine (no calls),
        // which is why the epilogue only reloads x29.
18         stp     x29,x30,[sp,#-16]!
19         add     x29,sp,#0
        // x3 carries the return code until .Lenc_key_abort: -1 = NULL pointer.
20         mov     x3,#-1
21         cmp     x0,#0
22         b.eq    .Lenc_key_abort
23         cmp     x2,#0
24         b.eq    .Lenc_key_abort
        // -2 = unsupported bit length: below 128, above 256, or not a
        // multiple of 64.
25         mov     x3,#-2
26         cmp     w1,#128
27         b.lt    .Lenc_key_abort
28         cmp     w1,#256
29         b.gt    .Lenc_key_abort
30         tst     w1,#0x3f
31         b.ne    .Lenc_key_abort
32
        // From here x3 is the .Lrcon pointer. The flags of the compare
        // against 192 must survive until the three branches below; the
        // vector ops, loads and mov in between do not write NZCV.
33         adr     x3,.Lrcon
34         cmp     w1,#192
35
        // v0 = 0 (zero key for aese and zero filler for ext),
        // v3 = first 16 key bytes, v1 = round constant, v2 = tbl mask.
36         eor     v0.16b,v0.16b,v0.16b
37         ld1     {v3.16b},[x0],#16
38         mov     w1,#8           // reuse w1
39         ld1     {v1.4s,v2.4s},[x3],#32
40
41         b.lt    .Loop128
42         b.eq    .L192
43         b       .L256
44
        // 128-bit key: eight loop iterations, then two unrolled ones after
        // the round constant is reloaded.
        // Per iteration: tbl builds the rotated/splatted last word, aese with
        // the zero key v0 applies the S-box to it, and the three ext/eor
        // pairs XOR each word of v3 with all lower words. The round constant
        // and the substituted word are then folded in to give the next
        // round key.
45 .align  4
46 .Loop128:
47         tbl     v6.16b,{v3.16b},v2.16b
48         ext     v5.16b,v0.16b,v3.16b,#12
49         st1     {v3.4s},[x2],#16
50         aese    v6.16b,v0.16b
51         subs    w1,w1,#1
52
53         eor     v3.16b,v3.16b,v5.16b
54         ext     v5.16b,v0.16b,v5.16b,#12
55         eor     v3.16b,v3.16b,v5.16b
56         ext     v5.16b,v0.16b,v5.16b,#12
57         eor     v6.16b,v6.16b,v1.16b
58         eor     v3.16b,v3.16b,v5.16b
59         shl     v1.16b,v1.16b,#1
60         eor     v3.16b,v3.16b,v6.16b
61         b.ne    .Loop128
62
        // x3 was advanced by 32 above, so this loads the third .Lrcon row
        // (0x1b) as the next round constant.
63         ld1     {v1.4s},[x3]
64
65         tbl     v6.16b,{v3.16b},v2.16b
66         ext     v5.16b,v0.16b,v3.16b,#12
67         st1     {v3.4s},[x2],#16
68         aese    v6.16b,v0.16b
69
70         eor     v3.16b,v3.16b,v5.16b
71         ext     v5.16b,v0.16b,v5.16b,#12
72         eor     v3.16b,v3.16b,v5.16b
73         ext     v5.16b,v0.16b,v5.16b,#12
74         eor     v6.16b,v6.16b,v1.16b
75         eor     v3.16b,v3.16b,v5.16b
76         shl     v1.16b,v1.16b,#1
77         eor     v3.16b,v3.16b,v6.16b
78
79         tbl     v6.16b,{v3.16b},v2.16b
80         ext     v5.16b,v0.16b,v3.16b,#12
81         st1     {v3.4s},[x2],#16
82         aese    v6.16b,v0.16b
83
84         eor     v3.16b,v3.16b,v5.16b
85         ext     v5.16b,v0.16b,v5.16b,#12
86         eor     v3.16b,v3.16b,v5.16b
87         ext     v5.16b,v0.16b,v5.16b,#12
88         eor     v6.16b,v6.16b,v1.16b
89         eor     v3.16b,v3.16b,v5.16b
90         eor     v3.16b,v3.16b,v6.16b
        // Last (11th) round key is stored without post-increment; x2 is at
        // entry+160 here, so adding 0x50 moves it to entry+240.
91         st1     {v3.4s},[x2]
92         add     x2,x2,#0x50
93
94         mov     w12,#10
95         b       .Ldone
96
        // 192-bit key: v4 receives the extra 8 key bytes. The tbl mask is
        // lowered by 8 in every byte so that it selects from the low half
        // of v4.
97 .align  4
98 .L192:
99         ld1     {v4.8b},[x0],#8
100         movi    v6.16b,#8                       // borrow v6.16b
101         st1     {v3.4s},[x2],#16
102         sub     v2.16b,v2.16b,v6.16b    // adjust the mask
103
        // Eight iterations (w1 = 8); each one writes 8 bytes from v4 and
        // then 16 bytes from v3, i.e. 24 bytes of schedule.
104 .Loop192:
105         tbl     v6.16b,{v4.16b},v2.16b
106         ext     v5.16b,v0.16b,v3.16b,#12
        // Big-endian builds store all of v4 and then step x2 back by 8 so
        // that the net advance is 8 bytes on both configurations.
107 #ifdef __ARMEB__
108         st1     {v4.4s},[x2],#16
109         sub     x2,x2,#8
110 #else
111         st1     {v4.8b},[x2],#8
112 #endif
113         aese    v6.16b,v0.16b
114         subs    w1,w1,#1
115
116         eor     v3.16b,v3.16b,v5.16b
117         ext     v5.16b,v0.16b,v5.16b,#12
118         eor     v3.16b,v3.16b,v5.16b
119         ext     v5.16b,v0.16b,v5.16b,#12
120         eor     v3.16b,v3.16b,v5.16b
121
122         dup     v5.4s,v3.s[3]
123         eor     v5.16b,v5.16b,v4.16b
124         eor     v6.16b,v6.16b,v1.16b
125         ext     v4.16b,v0.16b,v4.16b,#12
126         shl     v1.16b,v1.16b,#1
127         eor     v4.16b,v4.16b,v5.16b
128         eor     v3.16b,v3.16b,v6.16b
129         eor     v4.16b,v4.16b,v6.16b
130         st1     {v3.4s},[x2],#16
131         b.ne    .Loop192
132
        // x2 is at entry+208 after the loop; +0x20 brings it to entry+240.
133         mov     w12,#12
134         add     x2,x2,#0x20
135         b       .Ldone
136
        // 256-bit key: v3/v4 hold the two 16-byte halves. Seven iterations
        // (w1 = 7) of 32 bytes each follow the first stored round key, which
        // leaves x2 at entry+240 when the loop exits.
137 .align  4
138 .L256:
139         ld1     {v4.16b},[x0]
140         mov     w1,#7
141         mov     w12,#14
142         st1     {v3.4s},[x2],#16
143
144 .Loop256:
145         tbl     v6.16b,{v4.16b},v2.16b
146         ext     v5.16b,v0.16b,v3.16b,#12
147         st1     {v4.4s},[x2],#16
148         aese    v6.16b,v0.16b
149         subs    w1,w1,#1
150
151         eor     v3.16b,v3.16b,v5.16b
152         ext     v5.16b,v0.16b,v5.16b,#12
153         eor     v3.16b,v3.16b,v5.16b
154         ext     v5.16b,v0.16b,v5.16b,#12
155         eor     v6.16b,v6.16b,v1.16b
156         eor     v3.16b,v3.16b,v5.16b
157         shl     v1.16b,v1.16b,#1
158         eor     v3.16b,v3.16b,v6.16b
159         st1     {v3.4s},[x2],#16
        // Flags still come from the subs above; the loop ends right after
        // the final v3 has been stored.
160         b.eq    .Ldone
161
        // Second half-step: the last word of v3 is splatted without
        // rotation, substituted via aese with the zero key, and no round
        // constant is applied.
162         dup     v6.4s,v3.s[3]           // just splat
163         ext     v5.16b,v0.16b,v4.16b,#12
164         aese    v6.16b,v0.16b
165
166         eor     v4.16b,v4.16b,v5.16b
167         ext     v5.16b,v0.16b,v5.16b,#12
168         eor     v4.16b,v4.16b,v5.16b
169         ext     v5.16b,v0.16b,v5.16b,#12
170         eor     v4.16b,v4.16b,v5.16b
171
172         eor     v4.16b,v4.16b,v6.16b
173         b       .Loop256
174
        // Common success exit: x2 == entry+240, so this writes the round
        // count just behind the round keys.
175 .Ldone:
176         str     w12,[x2]
177         mov     x3,#0
178
179 .Lenc_key_abort:
180         mov     x0,x3                   // return value
        // Pops the 16-byte frame; only x29 needs restoring (see prologue).
181         ldr     x29,[sp],#16
182         ret
183 .size   aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
184
// aes_v8_set_decrypt_key: build a decryption key schedule.
//
// Takes the same x0/w1/x2 arguments as aes_v8_set_encrypt_key; they are
// passed through untouched to .Lenc_key. The encryption schedule is generated
// first, then the round keys are reversed in place and aesimc is applied to
// every key except the first and the last.
// Out:  x0 = 0 on success, otherwise the non-zero code returned by .Lenc_key
// Uses: x4, x12, v0, v1, flags, plus everything .Lenc_key uses.
185 .globl  aes_v8_set_decrypt_key
186 .type   aes_v8_set_decrypt_key,%function
187 .align  5
188 aes_v8_set_decrypt_key:
        // The return address is signed here and authenticated before ret.
        // Both instructions are emitted as raw encodings (presumably so that
        // assemblers without pointer-authentication support accept the file;
        // confirm against the generator script).
189 .inst   0xd503233f              // paciasp
190         stp     x29,x30,[sp,#-16]!
191         add     x29,sp,#0
        // The call overwrites x30, hence the full ldp x29,x30 in the
        // epilogue.
192         bl      .Lenc_key
193
194         cmp     x0,#0
195         b.ne    .Ldec_key_abort
196
        // .Lenc_key returns with x2 = schedule+240 and w12 = round count.
        // x2 walks up from the first round key, x0 walks down (x4 = -16)
        // from the last one at schedule + rounds*16.
197         sub     x2,x2,#240              // restore original x2
198         mov     x4,#-16
199         add     x0,x2,x12,lsl#4 // end of key schedule
200
        // Outermost pair: swapped only, no aesimc.
201         ld1     {v0.4s},[x2]
202         ld1     {v1.4s},[x0]
203         st1     {v0.4s},[x0],x4
204         st1     {v1.4s},[x2],#16
205
        // Inner pairs: swap and apply aesimc to both, until the two pointers
        // meet (unsigned compare).
206 .Loop_imc:
207         ld1     {v0.4s},[x2]
208         ld1     {v1.4s},[x0]
209         aesimc  v0.16b,v0.16b
210         aesimc  v1.16b,v1.16b
211         st1     {v0.4s},[x0],x4
212         st1     {v1.4s},[x2],#16
213         cmp     x0,x2
214         b.hi    .Loop_imc
215
        // Remaining middle round key: transformed, read via x2 and written
        // via x0.
216         ld1     {v0.4s},[x2]
217         aesimc  v0.16b,v0.16b
218         st1     {v0.4s},[x0]
219
220         eor     x0,x0,x0                // return value
221 .Ldec_key_abort:
222         ldp     x29,x30,[sp],#16
223 .inst   0xd50323bf              // autiasp
224         ret
225 .size   aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
// aes_v8_encrypt: encrypt a single 16-byte block.
//
// In:   x0 = input block, x1 = output block,
//       x2 = key schedule (round count is read from offset 240)
// Uses: w3, v0-v2, flags; x2 is advanced through the schedule.
//       Leaf routine: no stack frame, no calls.
226 .globl  aes_v8_encrypt
227 .type   aes_v8_encrypt,%function
228 .align  5
229 aes_v8_encrypt:
        // w3 = rounds - 2; v0/v1 hold the round keys in alternation, v2 is
        // the state.
230         ldr     w3,[x2,#240]
231         ld1     {v0.4s},[x2],#16
232         ld1     {v2.16b},[x0]
233         sub     w3,w3,#2
234         ld1     {v1.4s},[x2],#16
235
        // Two rounds per iteration; each key register is refilled right
        // after it has been consumed. w3 drops by 2 per pass.
236 .Loop_enc:
237         aese    v2.16b,v0.16b
238         aesmc   v2.16b,v2.16b
239         ld1     {v0.4s},[x2],#16
240         subs    w3,w3,#2
241         aese    v2.16b,v1.16b
242         aesmc   v2.16b,v2.16b
243         ld1     {v1.4s},[x2],#16
244         b.gt    .Loop_enc
245
        // Final two rounds: the last aese is not followed by aesmc, and the
        // last round key (loaded without post-increment) is applied with a
        // plain eor.
246         aese    v2.16b,v0.16b
247         aesmc   v2.16b,v2.16b
248         ld1     {v0.4s},[x2]
249         aese    v2.16b,v1.16b
250         eor     v2.16b,v2.16b,v0.16b
251
252         st1     {v2.16b},[x1]
253         ret
254 .size   aes_v8_encrypt,.-aes_v8_encrypt
// aes_v8_decrypt: decrypt a single 16-byte block.
//
// In:   x0 = input block, x1 = output block,
//       x2 = key schedule (round count is read from offset 240); the
//            schedule is walked front to back, so it is expected to be in
//            the order produced by aes_v8_set_decrypt_key
// Uses: w3, v0-v2, flags; x2 is advanced through the schedule.
//       Leaf routine: no stack frame, no calls.
255 .globl  aes_v8_decrypt
256 .type   aes_v8_decrypt,%function
257 .align  5
258 aes_v8_decrypt:
        // w3 = rounds - 2; v0/v1 hold the round keys in alternation, v2 is
        // the state.
259         ldr     w3,[x2,#240]
260         ld1     {v0.4s},[x2],#16
261         ld1     {v2.16b},[x0]
262         sub     w3,w3,#2
263         ld1     {v1.4s},[x2],#16
264
        // Two rounds per iteration, same shape as aes_v8_encrypt but with
        // aesd/aesimc.
265 .Loop_dec:
266         aesd    v2.16b,v0.16b
267         aesimc  v2.16b,v2.16b
268         ld1     {v0.4s},[x2],#16
269         subs    w3,w3,#2
270         aesd    v2.16b,v1.16b
271         aesimc  v2.16b,v2.16b
272         ld1     {v1.4s},[x2],#16
273         b.gt    .Loop_dec
274
        // Final two rounds: no aesimc after the last aesd; the last round
        // key is applied with a plain eor.
275         aesd    v2.16b,v0.16b
276         aesimc  v2.16b,v2.16b
277         ld1     {v0.4s},[x2]
278         aesd    v2.16b,v1.16b
279         eor     v2.16b,v2.16b,v0.16b
280
281         st1     {v2.16b},[x1]
282         ret
283 .size   aes_v8_decrypt,.-aes_v8_decrypt
// aes_v8_cbc_encrypt: AES-CBC encryption or decryption of whole blocks.
//
// In:   x0 = input, x1 = output, x2 = length in bytes,
//       x3 = key schedule (round count is read from offset 240),
//       x4 = 16-byte IV; read on entry and overwritten on exit with the
//            last ciphertext block processed,
//       w5 = direction: non-zero encrypts, zero decrypts
// Lengths below 16 return at once without touching the output or the IV.
// A trailing partial block is ignored (the length is rounded down to a
// multiple of 16).
// Uses: x2, x5-x8, x12, x14, v0-v7, v16-v23, flags; x0, x1 and x3 are
//       modified.
284 .globl  aes_v8_cbc_encrypt
285 .type   aes_v8_cbc_encrypt,%function
286 .align  5
287 aes_v8_cbc_encrypt:
        // Frame record; x30 is never overwritten (no calls), so the epilogue
        // only reloads x29.
288         stp     x29,x30,[sp,#-16]!
289         add     x29,sp,#0
        // x8 is the stride of the look-ahead input load: 16, or 0 when the
        // block just fetched is the last one, so that the extra load
        // re-reads it instead of running past the end of the input.
290         subs    x2,x2,#16
291         mov     x8,#16
292         b.lo    .Lcbc_abort
293         csel    x8,xzr,x8,eq
294
        // The flags set by this compare are consumed only by the
        // "b.eq .Lcbc_dec" further down; nothing in between writes NZCV.
        // w5 is then reused for the round count.
295         cmp     w5,#0                   // en- or decrypting?
296         ldr     w5,[x3,#240]
297         and     x2,x2,#-16
298         ld1     {v6.16b},[x4]
299         ld1     {v0.16b},[x0],x8
300
        // v16/v17 = first two round keys; v18-v23 and v7 = last seven round
        // keys (v7 being the very last). w5 ends up as rounds - 8, i.e.
        // 2, 4 or 6.
301         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
302         sub     w5,w5,#6
303         add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
304         sub     w5,w5,#2
305         ld1     {v18.4s,v19.4s},[x7],#32
306         ld1     {v20.4s,v21.4s},[x7],#32
307         ld1     {v22.4s,v23.4s},[x7],#32
308         ld1     {v7.4s},[x7]
309
310         add     x7,x3,#32
311         mov     w6,w5
312         b.eq    .Lcbc_dec
313
        // ---- Encryption ----
        // v0 = first plaintext block ^ IV. v5 = first ^ last round key: each
        // following plaintext block is XORed with v5 and kept in v16, so
        // that the next "aese v0,v16" cancels the not-yet-applied last round
        // key and applies the CBC chaining XOR and the first round key in
        // one step.
        // w5 == 2 selects the dedicated path for the shortest schedule.
314         cmp     w5,#2
315         eor     v0.16b,v0.16b,v6.16b
316         eor     v5.16b,v16.16b,v7.16b
317         b.eq    .Lcbc_enc128
318
        // Longer schedules: v2/v3 hold round keys 2 and 3 permanently;
        // x6, x12, x14 and x3 point at round keys 4, 5, 6 and 7, and x7 at
        // round key 1, so they can be reloaded into v16/v17 on every block.
319         ld1     {v2.4s,v3.4s},[x7]
320         add     x7,x3,#16
321         add     x6,x3,#16*4
322         add     x12,x3,#16*5
323         aese    v0.16b,v16.16b
324         aesmc   v0.16b,v0.16b
325         add     x14,x3,#16*6
326         add     x3,x3,#16*7
327         b       .Lenter_cbc_enc
328
329 .align  4
330 .Loop_cbc_enc:
        // v16 = next plaintext ^ v5 (see above); the previous ciphertext
        // block in v6 is written out here.
331         aese    v0.16b,v16.16b
332         aesmc   v0.16b,v0.16b
333         st1     {v6.16b},[x1],#16
334 .Lenter_cbc_enc:
335         aese    v0.16b,v17.16b
336         aesmc   v0.16b,v0.16b
337         aese    v0.16b,v2.16b
338         aesmc   v0.16b,v0.16b
339         ld1     {v16.4s},[x6]
        // w5 == 4: the two extra rounds below are skipped.
340         cmp     w5,#4
341         aese    v0.16b,v3.16b
342         aesmc   v0.16b,v0.16b
343         ld1     {v17.4s},[x12]
344         b.eq    .Lcbc_enc192
345
346         aese    v0.16b,v16.16b
347         aesmc   v0.16b,v0.16b
348         ld1     {v16.4s},[x14]
349         aese    v0.16b,v17.16b
350         aesmc   v0.16b,v0.16b
351         ld1     {v17.4s},[x3]
352         nop
353
354 .Lcbc_enc192:
355         aese    v0.16b,v16.16b
356         aesmc   v0.16b,v0.16b
        // Flags from this subs drive both the csel (stop advancing x0 once
        // the last block has been fetched) and the b.hs at the bottom of the
        // loop.
357         subs    x2,x2,#16
358         aese    v0.16b,v17.16b
359         aesmc   v0.16b,v0.16b
360         csel    x8,xzr,x8,eq
361         aese    v0.16b,v18.16b
362         aesmc   v0.16b,v0.16b
363         aese    v0.16b,v19.16b
364         aesmc   v0.16b,v0.16b
365         ld1     {v16.16b},[x0],x8
366         aese    v0.16b,v20.16b
367         aesmc   v0.16b,v0.16b
368         eor     v16.16b,v16.16b,v5.16b
369         aese    v0.16b,v21.16b
370         aesmc   v0.16b,v0.16b
371         ld1     {v17.4s},[x7]           // re-pre-load rndkey[1]
372         aese    v0.16b,v22.16b
373         aesmc   v0.16b,v0.16b
        // v0 is left without the last round key; v6 = finished ciphertext.
374         aese    v0.16b,v23.16b
375         eor     v6.16b,v0.16b,v7.16b
376         b.hs    .Loop_cbc_enc
377
378         st1     {v6.16b},[x1],#16
379         b       .Lcbc_done
380
        // Shortest schedule (w5 == 2): every round key stays resident in
        // registers (v16, v17, v2, v3, v18-v23, v7); only the input block is
        // loaded inside the loop.
381 .align  5
382 .Lcbc_enc128:
383         ld1     {v2.4s,v3.4s},[x7]
384         aese    v0.16b,v16.16b
385         aesmc   v0.16b,v0.16b
386         b       .Lenter_cbc_enc128
387 .Loop_cbc_enc128:
388         aese    v0.16b,v16.16b
389         aesmc   v0.16b,v0.16b
390         st1     {v6.16b},[x1],#16
391 .Lenter_cbc_enc128:
392         aese    v0.16b,v17.16b
393         aesmc   v0.16b,v0.16b
394         subs    x2,x2,#16
395         aese    v0.16b,v2.16b
396         aesmc   v0.16b,v0.16b
397         csel    x8,xzr,x8,eq
398         aese    v0.16b,v3.16b
399         aesmc   v0.16b,v0.16b
400         aese    v0.16b,v18.16b
401         aesmc   v0.16b,v0.16b
402         aese    v0.16b,v19.16b
403         aesmc   v0.16b,v0.16b
404         ld1     {v16.16b},[x0],x8
405         aese    v0.16b,v20.16b
406         aesmc   v0.16b,v0.16b
407         aese    v0.16b,v21.16b
408         aesmc   v0.16b,v0.16b
409         aese    v0.16b,v22.16b
410         aesmc   v0.16b,v0.16b
411         eor     v16.16b,v16.16b,v5.16b
412         aese    v0.16b,v23.16b
413         eor     v6.16b,v0.16b,v7.16b
414         b.hs    .Loop_cbc_enc128
415
416         st1     {v6.16b},[x1],#16
417         b       .Lcbc_done
        // ---- Decryption ----
        // Blocks are processed three at a time in v0/v1/v18, with the
        // untouched ciphertext kept in v2/v3/v19 for the chaining XOR.
        // x2 is biased by a further -32, so "lo" here means fewer than three
        // blocks in total; those go straight to the one/two-block tail.
        // w6 = w5 + 2 is the round-pair counter for the loops below.
418 .align  5
419 .Lcbc_dec:
420         ld1     {v18.16b},[x0],#16
421         subs    x2,x2,#32               // bias
422         add     w6,w5,#2
423         orr     v3.16b,v0.16b,v0.16b
424         orr     v1.16b,v0.16b,v0.16b
425         orr     v19.16b,v18.16b,v18.16b
426         b.lo    .Lcbc_dec_tail
427
428         orr     v1.16b,v18.16b,v18.16b
429         ld1     {v18.16b},[x0],#16
430         orr     v2.16b,v0.16b,v0.16b
431         orr     v3.16b,v1.16b,v1.16b
432         orr     v19.16b,v18.16b,v18.16b
433
434 .Loop3x_cbc_dec:
435         aesd    v0.16b,v16.16b
436         aesimc  v0.16b,v0.16b
437         aesd    v1.16b,v16.16b
438         aesimc  v1.16b,v1.16b
439         aesd    v18.16b,v16.16b
440         aesimc  v18.16b,v18.16b
441         ld1     {v16.4s},[x7],#16
442         subs    w6,w6,#2
443         aesd    v0.16b,v17.16b
444         aesimc  v0.16b,v0.16b
445         aesd    v1.16b,v17.16b
446         aesimc  v1.16b,v1.16b
447         aesd    v18.16b,v17.16b
448         aesimc  v18.16b,v18.16b
449         ld1     {v17.4s},[x7],#16
450         b.gt    .Loop3x_cbc_dec
451
        // Remaining rounds, interleaved with the set-up of the next group.
        // v4/v5/v17 = previous ciphertext (or IV) ^ last round key, so one
        // eor at the end applies both the last round key and the CBC XOR.
        // Flags from the "subs x2" below are consumed by the csel and by the
        // b.hs that closes the loop.
452         aesd    v0.16b,v16.16b
453         aesimc  v0.16b,v0.16b
454         aesd    v1.16b,v16.16b
455         aesimc  v1.16b,v1.16b
456         aesd    v18.16b,v16.16b
457         aesimc  v18.16b,v18.16b
458         eor     v4.16b,v6.16b,v7.16b
459         subs    x2,x2,#0x30
460         eor     v5.16b,v2.16b,v7.16b
461         csel    x6,x2,x6,lo                     // x6, w6, is zero at this point
462         aesd    v0.16b,v17.16b
463         aesimc  v0.16b,v0.16b
464         aesd    v1.16b,v17.16b
465         aesimc  v1.16b,v1.16b
466         aesd    v18.16b,v17.16b
467         aesimc  v18.16b,v18.16b
468         eor     v17.16b,v3.16b,v7.16b
469         add     x0,x0,x6                // x0 is adjusted in such way that
470                                         // at exit from the loop v1.16b-v18.16b
471                                         // are loaded with last "words"
        // v6 = last ciphertext block of this group = IV for what follows.
        // x7 is rewound to the start of the key schedule.
472         orr     v6.16b,v19.16b,v19.16b
473         mov     x7,x3
474         aesd    v0.16b,v20.16b
475         aesimc  v0.16b,v0.16b
476         aesd    v1.16b,v20.16b
477         aesimc  v1.16b,v1.16b
478         aesd    v18.16b,v20.16b
479         aesimc  v18.16b,v18.16b
480         ld1     {v2.16b},[x0],#16
481         aesd    v0.16b,v21.16b
482         aesimc  v0.16b,v0.16b
483         aesd    v1.16b,v21.16b
484         aesimc  v1.16b,v1.16b
485         aesd    v18.16b,v21.16b
486         aesimc  v18.16b,v18.16b
487         ld1     {v3.16b},[x0],#16
488         aesd    v0.16b,v22.16b
489         aesimc  v0.16b,v0.16b
490         aesd    v1.16b,v22.16b
491         aesimc  v1.16b,v1.16b
492         aesd    v18.16b,v22.16b
493         aesimc  v18.16b,v18.16b
494         ld1     {v19.16b},[x0],#16
495         aesd    v0.16b,v23.16b
496         aesd    v1.16b,v23.16b
497         aesd    v18.16b,v23.16b
498         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
499         add     w6,w5,#2
500         eor     v4.16b,v4.16b,v0.16b
501         eor     v5.16b,v5.16b,v1.16b
502         eor     v18.16b,v18.16b,v17.16b
503         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
504         st1     {v4.16b},[x1],#16
505         orr     v0.16b,v2.16b,v2.16b
506         st1     {v5.16b},[x1],#16
507         orr     v1.16b,v3.16b,v3.16b
508         st1     {v18.16b},[x1],#16
509         orr     v18.16b,v19.16b,v19.16b
510         b.hs    .Loop3x_cbc_dec
511
        // x2 == -0x30 means the input was consumed exactly; otherwise one or
        // two blocks are left for the tail.
512         cmn     x2,#0x30
513         b.eq    .Lcbc_done
514         nop
515
        // Tail: runs the cipher on v1 and v18. x2 == -0x20 (tested by the
        // cmn below, consumed by "b.eq .Lcbc_dec_one") means a single block
        // is left, in which case only v18 is used.
516 .Lcbc_dec_tail:
517         aesd    v1.16b,v16.16b
518         aesimc  v1.16b,v1.16b
519         aesd    v18.16b,v16.16b
520         aesimc  v18.16b,v18.16b
521         ld1     {v16.4s},[x7],#16
522         subs    w6,w6,#2
523         aesd    v1.16b,v17.16b
524         aesimc  v1.16b,v1.16b
525         aesd    v18.16b,v17.16b
526         aesimc  v18.16b,v18.16b
527         ld1     {v17.4s},[x7],#16
528         b.gt    .Lcbc_dec_tail
529
530         aesd    v1.16b,v16.16b
531         aesimc  v1.16b,v1.16b
532         aesd    v18.16b,v16.16b
533         aesimc  v18.16b,v18.16b
534         aesd    v1.16b,v17.16b
535         aesimc  v1.16b,v1.16b
536         aesd    v18.16b,v17.16b
537         aesimc  v18.16b,v18.16b
538         aesd    v1.16b,v20.16b
539         aesimc  v1.16b,v1.16b
540         aesd    v18.16b,v20.16b
541         aesimc  v18.16b,v18.16b
542         cmn     x2,#0x20
543         aesd    v1.16b,v21.16b
544         aesimc  v1.16b,v1.16b
545         aesd    v18.16b,v21.16b
546         aesimc  v18.16b,v18.16b
547         eor     v5.16b,v6.16b,v7.16b
548         aesd    v1.16b,v22.16b
549         aesimc  v1.16b,v1.16b
550         aesd    v18.16b,v22.16b
551         aesimc  v18.16b,v18.16b
552         eor     v17.16b,v3.16b,v7.16b
553         aesd    v1.16b,v23.16b
554         aesd    v18.16b,v23.16b
555         b.eq    .Lcbc_dec_one
        // Two blocks left: v1 chains on the IV (v5), v18 on the previous
        // ciphertext (v17).
556         eor     v5.16b,v5.16b,v1.16b
557         eor     v17.16b,v17.16b,v18.16b
558         orr     v6.16b,v19.16b,v19.16b
559         st1     {v5.16b},[x1],#16
560         st1     {v17.16b},[x1],#16
561         b       .Lcbc_done
562
        // One block left: only v18 is meaningful and it chains on the IV.
563 .Lcbc_dec_one:
564         eor     v5.16b,v5.16b,v18.16b
565         orr     v6.16b,v19.16b,v19.16b
566         st1     {v5.16b},[x1],#16
567
        // v6 holds the last ciphertext block (both directions); it is
        // written back as the new IV.
568 .Lcbc_done:
569         st1     {v6.16b},[x4]
570 .Lcbc_abort:
        // Pops the 16-byte frame; only x29 needs restoring (see prologue).
571         ldr     x29,[sp],#16
572         ret
573 .size   aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
// aes_v8_ctr32_encrypt_blocks: AES in counter mode with a 32-bit counter.
//
// In:   x0 = input, x1 = output, x2 = number of 16-byte blocks,
//       x3 = key schedule (round count is read from offset 240),
//       x4 = 16-byte counter block; only read here, never written back
// The counter is the last 32-bit word of the block. It is byte-reversed on
// little-endian builds before arithmetic and reversed again whenever it is
// inserted into a counter block. It is incremented in 32-bit registers, so
// it wraps modulo 2^32 without carrying into the other words.
// NOTE(review): there is no explicit check for x2 == 0; confirm that callers
// never pass a zero block count.
// Uses: x2, x5-x10, x12, v0-v7, v16-v23, flags; x0 and x1 are advanced.
574 .globl  aes_v8_ctr32_encrypt_blocks
575 .type   aes_v8_ctr32_encrypt_blocks,%function
576 .align  5
577 aes_v8_ctr32_encrypt_blocks:
        // Frame record; x30 is never overwritten (no calls), so the epilogue
        // only reloads x29.
578         stp     x29,x30,[sp,#-16]!
579         add     x29,sp,#0
580         ldr     w5,[x3,#240]
581
        // w8 = counter word, v0 = whole counter block.
582         ldr     w8, [x4, #12]
583 #ifdef __ARMEB__
584         ld1     {v0.16b},[x4]
585 #else
586         ld1     {v0.4s},[x4]
587 #endif
        // v16/v17 = first two round keys; v20-v23 and v7 = last five round
        // keys (v7 being the very last). w5 ends up as rounds - 6 and is
        // copied into w6 as the round-pair counter.
        // The flags of "cmp x2,#2" stay live across everything up to the
        // csel and the "b.ls .Lctr32_tail" below; nothing in between writes
        // NZCV.
588         ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
589         sub     w5,w5,#4
590         mov     x12,#16
591         cmp     x2,#2
592         add     x7,x3,x5,lsl#4  // pointer to last 5 round keys
593         sub     w5,w5,#2
594         ld1     {v20.4s,v21.4s},[x7],#32
595         ld1     {v22.4s,v23.4s},[x7],#32
596         ld1     {v7.4s},[x7]
597         add     x7,x3,#32
598         mov     w6,w5
        // x12 = input stride for the tail: 0 when fewer than two blocks were
        // requested, so the second tail load does not step past the input.
599         csel    x12,xzr,x12,lo
600 #ifndef __ARMEB__
601         rev     w8, w8
602 #endif
        // v1 = counter block + 1 (built in v6); w8 moves on to counter + 2.
603         add     w10, w8, #1
604         orr     v6.16b,v0.16b,v0.16b
605         rev     w10, w10
606         mov     v6.s[3],w10
607         add     w8, w8, #2
608         orr     v1.16b,v6.16b,v6.16b
609         b.ls    .Lctr32_tail
        // Three or more blocks: v18 = counter block + 2. w12 is used as
        // scratch here, which is why x12 is set up again before falling into
        // the tail. x2 is biased by -3 for the loop test.
610         rev     w12, w8
611         mov     v6.s[3],w12
612         sub     x2,x2,#3                // bias
613         orr     v18.16b,v6.16b,v6.16b
614         b       .Loop3x_ctr32
615
        // Main loop: three counter blocks (v0, v1, v18) are encrypted in
        // parallel.
616 .align  4
617 .Loop3x_ctr32:
618         aese    v0.16b,v16.16b
619         aesmc   v0.16b,v0.16b
620         aese    v1.16b,v16.16b
621         aesmc   v1.16b,v1.16b
622         aese    v18.16b,v16.16b
623         aesmc   v18.16b,v18.16b
624         ld1     {v16.4s},[x7],#16
625         subs    w6,w6,#2
626         aese    v0.16b,v17.16b
627         aesmc   v0.16b,v0.16b
628         aese    v1.16b,v17.16b
629         aesmc   v1.16b,v1.16b
630         aese    v18.16b,v17.16b
631         aesmc   v18.16b,v18.16b
632         ld1     {v17.4s},[x7],#16
633         b.gt    .Loop3x_ctr32
634
        // Remaining rounds continue in v4/v5/v17 so that v0/v1/v18 can be
        // refilled with the next three counter blocks (w9, w10, w12 = the
        // byte-reversed next counter values; w8 advances by 3).
        // The input blocks v2/v3/v19 are pre-XORed with the last round key
        // v7, so a single eor at the end yields the output.
635         aese    v0.16b,v16.16b
636         aesmc   v4.16b,v0.16b
637         aese    v1.16b,v16.16b
638         aesmc   v5.16b,v1.16b
639         ld1     {v2.16b},[x0],#16
640         add     w9,w8,#1
641         aese    v18.16b,v16.16b
642         aesmc   v18.16b,v18.16b
643         ld1     {v3.16b},[x0],#16
644         rev     w9,w9
645         aese    v4.16b,v17.16b
646         aesmc   v4.16b,v4.16b
647         aese    v5.16b,v17.16b
648         aesmc   v5.16b,v5.16b
649         ld1     {v19.16b},[x0],#16
650         mov     x7,x3
651         aese    v18.16b,v17.16b
652         aesmc   v17.16b,v18.16b
653         aese    v4.16b,v20.16b
654         aesmc   v4.16b,v4.16b
655         aese    v5.16b,v20.16b
656         aesmc   v5.16b,v5.16b
657         eor     v2.16b,v2.16b,v7.16b
658         add     w10,w8,#2
659         aese    v17.16b,v20.16b
660         aesmc   v17.16b,v17.16b
661         eor     v3.16b,v3.16b,v7.16b
662         add     w8,w8,#3
663         aese    v4.16b,v21.16b
664         aesmc   v4.16b,v4.16b
665         aese    v5.16b,v21.16b
666         aesmc   v5.16b,v5.16b
667         eor     v19.16b,v19.16b,v7.16b
668         mov     v6.s[3], w9
669         aese    v17.16b,v21.16b
670         aesmc   v17.16b,v17.16b
671         orr     v0.16b,v6.16b,v6.16b
672         rev     w10,w10
673         aese    v4.16b,v22.16b
674         aesmc   v4.16b,v4.16b
675         mov     v6.s[3], w10
676         rev     w12,w8
677         aese    v5.16b,v22.16b
678         aesmc   v5.16b,v5.16b
679         orr     v1.16b,v6.16b,v6.16b
680         mov     v6.s[3], w12
681         aese    v17.16b,v22.16b
682         aesmc   v17.16b,v17.16b
683         orr     v18.16b,v6.16b,v6.16b
        // Flags from this subs are consumed by the b.hs that closes the
        // loop; the vector ops, loads, stores and mov in between leave NZCV
        // alone.
684         subs    x2,x2,#3
685         aese    v4.16b,v23.16b
686         aese    v5.16b,v23.16b
687         aese    v17.16b,v23.16b
688
689         eor     v2.16b,v2.16b,v4.16b
690         ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
691         st1     {v2.16b},[x1],#16
692         eor     v3.16b,v3.16b,v5.16b
693         mov     w6,w5
694         st1     {v3.16b},[x1],#16
695         eor     v19.16b,v19.16b,v17.16b
696         ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
697         st1     {v19.16b},[x1],#16
698         b.hs    .Loop3x_ctr32
699
        // Undo the bias: x2 = blocks still to do (0, 1 or 2). x12 is rebuilt
        // as the tail stride (0 for a single block, else 16).
700         adds    x2,x2,#3
701         b.eq    .Lctr32_done
702         cmp     x2,#1
703         mov     x12,#16
704         csel    x12,xzr,x12,eq
705
        // Tail: always encrypts two counter blocks (v0, v1); the second
        // result is stored only when two blocks were actually requested.
706 .Lctr32_tail:
707         aese    v0.16b,v16.16b
708         aesmc   v0.16b,v0.16b
709         aese    v1.16b,v16.16b
710         aesmc   v1.16b,v1.16b
711         ld1     {v16.4s},[x7],#16
712         subs    w6,w6,#2
713         aese    v0.16b,v17.16b
714         aesmc   v0.16b,v0.16b
715         aese    v1.16b,v17.16b
716         aesmc   v1.16b,v1.16b
717         ld1     {v17.4s},[x7],#16
718         b.gt    .Lctr32_tail
719
720         aese    v0.16b,v16.16b
721         aesmc   v0.16b,v0.16b
722         aese    v1.16b,v16.16b
723         aesmc   v1.16b,v1.16b
724         aese    v0.16b,v17.16b
725         aesmc   v0.16b,v0.16b
726         aese    v1.16b,v17.16b
727         aesmc   v1.16b,v1.16b
        // With x12 == 0 the following v3 load re-reads the same block
        // instead of touching memory beyond the input.
728         ld1     {v2.16b},[x0],x12
729         aese    v0.16b,v20.16b
730         aesmc   v0.16b,v0.16b
731         aese    v1.16b,v20.16b
732         aesmc   v1.16b,v1.16b
733         ld1     {v3.16b},[x0]
734         aese    v0.16b,v21.16b
735         aesmc   v0.16b,v0.16b
736         aese    v1.16b,v21.16b
737         aesmc   v1.16b,v1.16b
738         eor     v2.16b,v2.16b,v7.16b
739         aese    v0.16b,v22.16b
740         aesmc   v0.16b,v0.16b
741         aese    v1.16b,v22.16b
742         aesmc   v1.16b,v1.16b
743         eor     v3.16b,v3.16b,v7.16b
744         aese    v0.16b,v23.16b
745         aese    v1.16b,v23.16b
746
747         cmp     x2,#1
748         eor     v2.16b,v2.16b,v0.16b
749         eor     v3.16b,v3.16b,v1.16b
750         st1     {v2.16b},[x1],#16
751         b.eq    .Lctr32_done
752         st1     {v3.16b},[x1]
753
754 .Lctr32_done:
        // Pops the 16-byte frame; only x29 needs restoring (see prologue).
755         ldr     x29,[sp],#16
756         ret
757 .size   aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
758 #endif