/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
.type _vpaes_consts,%object
.align 7 // totally strategic alignment
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
.Lk_dipt: // decryption input transform
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo: // decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9: // decryption sbox output *9*u, *9*t
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd: // decryption sbox output *D*u, *D*t
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb: // decryption sbox output *B*u, *B*t
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe: // decryption sbox output *E*u, *E*t
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
// Key schedule constants
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.size _vpaes_consts,.-_vpaes_consts
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
.type _vpaes_encrypt_preheat,%function
_vpaes_encrypt_preheat:
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
## AES-encrypt %xmm0.
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
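##
## Editor's note: the core never indexes a byte-wise S-box in memory.
## Each nibble of the state selects an entry of a 16-byte table held in
## a register ("and"/"ushr #4" to split, "tbl" to look up), so there are
## no data-dependent loads.  A rough C sketch of that split, with
## hypothetical lut_lo[16]/lut_hi[16] standing in for the loaded tables:
##
##     lo = state[i] & 0x0f;            /* low nibble  */
##     hi = state[i] >> 4;              /* high nibble */
##     out[i] = lut_lo[lo] ^ lut_hi[hi];
##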
.type _vpaes_encrypt_core,%function
ldr w8, [x2,#240] // pull rounds
adr x11, .Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
// middle of middle round
tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
sub w8, w8, #1 // nr--
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
// middle of last round
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
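//
// Editor's note: in the "middle of middle round" block above, MixColumns
// is folded into permutations.  A is the keyed S-box output, and the
// .Lk_mc_forward/.Lk_mc_backward shuffles produce the byte-rotated copies
// (B, C, D in the inline comments) that are xored together into
// 2A+3B+C+D, so no GF(2^8) multiplies are needed.
//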
.type vpaes_encrypt,%function
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
bl _vpaes_encrypt_preheat
bl _vpaes_encrypt_core
.inst 0xd50323bf // autiasp
.size vpaes_encrypt,.-vpaes_encrypt
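//
// Editor's note: a minimal usage sketch, assuming the AES_encrypt-style
// prototype that callers of this file normally declare (the prototype
// itself is not part of this file):
//
//     void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//                        const AES_KEY *key);
//     vpaes_encrypt(pt, ct, &enc_key);   /* one 16-byte block */
//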
.type _vpaes_encrypt_2x,%function
ldr w8, [x2,#240] // pull rounds
adr x11, .Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
and v9.16b, v15.16b, v17.16b
ushr v8.16b, v15.16b, #4
tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
tbl v9.16b, {v20.16b}, v9.16b
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
tbl v10.16b, {v21.16b}, v8.16b
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v8.16b, v9.16b, v16.16b
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
eor v8.16b, v8.16b, v10.16b
// middle of middle round
tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
tbl v12.16b, {v25.16b}, v10.16b
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
tbl v8.16b, {v24.16b}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
tbl v13.16b, {v27.16b}, v10.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
tbl v10.16b, {v26.16b}, v11.16b
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
tbl v11.16b, {v8.16b}, v1.16b
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
eor v10.16b, v10.16b, v13.16b
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
tbl v8.16b, {v8.16b}, v4.16b
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
eor v11.16b, v11.16b, v10.16b
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
tbl v12.16b, {v11.16b},v1.16b
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
eor v8.16b, v8.16b, v11.16b
and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
eor v8.16b, v8.16b, v12.16b
sub w8, w8, #1 // nr--
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
and v9.16b, v8.16b, v17.16b
ushr v8.16b, v8.16b, #4
tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
tbl v13.16b, {v19.16b},v9.16b
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
eor v9.16b, v9.16b, v8.16b
tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v11.16b, {v18.16b},v8.16b
tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
tbl v12.16b, {v18.16b},v9.16b
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v11.16b, v11.16b, v13.16b
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
eor v12.16b, v12.16b, v13.16b
tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v10.16b, {v18.16b},v11.16b
tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
tbl v11.16b, {v18.16b},v12.16b
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v10.16b, v10.16b, v9.16b
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
eor v11.16b, v11.16b, v8.16b
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
cbnz w8, .Lenc_2x_loop
// middle of last round
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
tbl v12.16b, {v22.16b}, v10.16b
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
tbl v8.16b, {v23.16b}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
tbl v1.16b, {v8.16b},v1.16b
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
.type _vpaes_decrypt_preheat,%function
_vpaes_decrypt_preheat:
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
## Same API as encryption core.
.type _vpaes_decrypt_core,%function
ldr w8, [x2,#240] // pull rounds
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
and x11, x11, #0x30 // and $0x30, %r11
adr x10, .Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
// Inverse mix columns
// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
sub w8, w8, #1 // sub $1,%rax # nr--
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
// middle of last round
// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
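//
// Editor's note: the decrypt core interleaves the inverse S-box with
// InvMixColumns.  Per the constants' comments, .Lk_dsb9/.Lk_dsbd/
// .Lk_dsbb/.Lk_dsbe hold the S-box output premultiplied by 9, D, B and E,
// and the repeated "MC ch" shuffle plus the "ext ..., #12" rotation walk
// the accumulator through those terms, consistent with the "inverse
// mixcolumns" circulant E,B,D,9 named in the key-schedule comments below.
//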
.type vpaes_decrypt,%function
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
bl _vpaes_decrypt_preheat
bl _vpaes_decrypt_core
.inst 0xd50323bf // autiasp
.size vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type _vpaes_decrypt_2x,%function
ldr w8, [x2,#240] // pull rounds
// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
and x11, x11, #0x30 // and $0x30, %r11
adr x10, .Lk_mc_forward+48
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
and v9.16b, v15.16b, v17.16b
ushr v8.16b, v15.16b, #4
tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
tbl v10.16b, {v20.16b},v9.16b
ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
tbl v8.16b, {v21.16b},v8.16b
eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
eor v10.16b, v10.16b, v16.16b
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
eor v8.16b, v8.16b, v10.16b
// Inverse mix columns
// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
tbl v12.16b, {v24.16b}, v10.16b
tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
tbl v9.16b, {v25.16b}, v11.16b
eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
eor v8.16b, v12.16b, v16.16b
// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
tbl v12.16b, {v26.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
tbl v9.16b, {v27.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
tbl v12.16b, {v28.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
tbl v9.16b, {v29.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
tbl v12.16b, {v30.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
tbl v9.16b, {v31.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
sub w8, w8, #1 // sub $1,%rax # nr--
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
and v9.16b, v8.16b, v17.16b
ushr v8.16b, v8.16b, #4
tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
tbl v10.16b, {v19.16b},v9.16b
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
eor v9.16b, v9.16b, v8.16b
tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v11.16b, {v18.16b},v8.16b
tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
tbl v12.16b, {v18.16b},v9.16b
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v11.16b, v11.16b, v10.16b
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
eor v12.16b, v12.16b, v10.16b
tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v10.16b, {v18.16b},v11.16b
tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
tbl v11.16b, {v18.16b},v12.16b
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v10.16b, v10.16b, v9.16b
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
eor v11.16b, v11.16b, v8.16b
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
cbnz w8, .Ldec_2x_loop
// middle of last round
// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
tbl v12.16b, {v22.16b}, v10.16b
// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
tbl v9.16b, {v23.16b}, v11.16b
ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
eor v8.16b, v9.16b, v12.16b
tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
tbl v1.16b, {v8.16b},v2.16b
.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
########################################################
## AES key schedule ##
########################################################
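##
## Editor's note: the vpaes_set_*_key wrappers at the bottom of the file
## store AES_KEY->rounds = nbits/32+5 and call _vpaes_schedule_core with
## w3 = 0 (encrypting) or 1 (decrypting).  Encryption schedules are
## written forward from the start of the key buffer; decryption schedules
## are written backward from the end, which is why vpaes_set_decrypt_key
## first moves x2 to the end of the schedule (the "lea 16(%rdx,%rax),%rdx"
## comment) and _vpaes_schedule_mangle uses its "add $-16, %rdx" path.
##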
.type _vpaes_key_preheat,%function
movi v16.16b, #0x5b // .Lk_s63
movi v17.16b, #0x0f // .Lk_s0F
ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
adr x11, .Lk_mc_forward
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
ld1 {v8.2d}, [x10] // .Lk_rcon
ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
.size _vpaes_key_preheat,.-_vpaes_key_preheat
.type _vpaes_schedule_core,%function
_vpaes_schedule_core:
.inst 0xd503233f // paciasp
stp x29, x30, [sp,#-16]!
bl _vpaes_key_preheat // load the tables
ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
bl _vpaes_schedule_transform
mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
cbnz w3, .Lschedule_am_decrypting
// encrypting, output zeroth round key after transform
st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
.Lschedule_am_decrypting:
// decrypting, output zeroth round key after shiftrows
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
eor x8, x8, #0x30 // xor $0x30, %r8
cmp w1, #192 // cmp $192, %esi
## 128-bit specific part of key schedule.
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
mov x0, #10 // mov $10, %esi
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // write output
## 192-bit specific part of key schedule.
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
bl _vpaes_schedule_transform // input transform
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
mov x0, #4 // mov $4, %esi
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_round
ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
bl _vpaes_schedule_mangle // save key n
bl _vpaes_schedule_192_smear
bl _vpaes_schedule_mangle // save key n+1
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // save key n+2
bl _vpaes_schedule_192_smear
## 256-bit specific part of key schedule.
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov x0, #7 // mov $7, %esi
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle
// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
## .aes_schedule_mangle_last
## Mangler for last round of key schedule
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
## Always called right before return... jumps to cleanup and exits
.Lschedule_mangle_last:
// schedule last round key from xmm0
adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
cbnz w3, .Lschedule_mangle_last_dec
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
add x2, x2, #32 // add $32, %rdx
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
.Lschedule_mangle_last_dec:
ld1 {v20.2d,v21.2d}, [x11] // reload constants
sub x2, x2, #16 // add $-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
ldp x29, x30, [sp],#16
.inst 0xd50323bf // autiasp
.size _vpaes_schedule_core,.-_vpaes_schedule_core
## .aes_schedule_192_smear
## Smear the short, low side in the 192-bit key schedule.
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
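##
## Editor's note: written out in the word notation used above (highest
## word first), the smear below amounts to, roughly:
##
##     xmm6 = (d c 0 0) ^ (c 0 0 0)   = (c+d   c   0 0)
##     xmm6 = (c+d c 0 0) ^ (b b b a) = (b+c+d b+c b a)  -> copied to xmm0
##     xmm6 low half then cleared     = (b+c+d b+c 0 0)
##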
.type _vpaes_schedule_192_smear,%function
_vpaes_schedule_192_smear:
ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
## .aes_schedule_round
## Runs one main round of the key schedule on %xmm0, %xmm7
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of %xmm7.
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for the next rcon.
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
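##
## Editor's note: this is the textbook AES key-expansion step, performed
## through the vector-permutation S-box instead of a memory lookup.
## Roughly, in FIPS-197 terms (a sketch, not the literal data flow):
##
##     temp = SubWord(RotWord(prev_high_word)) ^ rcon;
##     /* new words are then built by the xor-smear of %xmm7 described above */
##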
.type _vpaes_schedule_round,%function
_vpaes_schedule_round:
// extract rcon from xmm8
movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
// add in smeared stuff
eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
.size _vpaes_schedule_round,.-_vpaes_schedule_round
## .aes_schedule_transform
## Linear-transform %xmm0 according to tables at (%r11)
## Requires that %xmm9 = 0x0F0F... as in preheat
## Clobbers %xmm1, %xmm2
.type _vpaes_schedule_transform,%function
_vpaes_schedule_transform:
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
// vmovdqa (%r11), %xmm2 # lo
tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
// vmovdqa 16(%r11), %xmm1 # hi
tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
## .aes_schedule_mangle
## Mangle xmm0 from (basis-transformed) standard version to our version.
## multiply by circulant 0,1,1,1
## apply shiftrows transform
## multiply by "inverse mixcolumns" circulant E,B,D,9
## apply shiftrows transform
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Clobbers xmm1-xmm5
.type _vpaes_schedule_mangle,%function
_vpaes_schedule_mangle:
mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
// vmovdqa .Lk_mc_forward(%rip),%xmm5
cbnz w3, .Lschedule_mangle_dec
eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
add x2, x2, #16 // add $16, %rdx
tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
b .Lschedule_mangle_both
.Lschedule_mangle_dec:
// inverse mix columns
// lea .Lk_dksd(%rip),%r11
ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
// vmovdqa 0x00(%r11), %xmm2
tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
// vmovdqa 0x10(%r11), %xmm3
tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
// vmovdqa 0x20(%r11), %xmm2
tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
// vmovdqa 0x30(%r11), %xmm3
tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
// vmovdqa 0x40(%r11), %xmm2
tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
// vmovdqa 0x50(%r11), %xmm3
tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
// vmovdqa 0x60(%r11), %xmm2
tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
// vmovdqa 0x70(%r11), %xmm4
tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
sub x2, x2, #16 // add $-16, %rdx
.Lschedule_mangle_both:
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
add x8, x8, #64-16 // add $-16, %r8
and x8, x8, #~(1<<6) // and $0x30, %r8
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
vpaes_set_encrypt_key:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
stp d8,d9,[sp,#-16]! // ABI spec says so
lsr w9, w1, #5 // shr $5,%eax
add w9, w9, #5 // $5,%eax
str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
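// Editor's note: a rough C equivalent of the three instructions above,
// matching the "nbits/32+5" comment (9, 11 or 13 for 128-, 192- or
// 256-bit keys; the cores read this value back via [x2,#240]):
//
//     key->rounds = (bits >> 5) + 5;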
mov w3, #0 // mov $0,%ecx
mov x8, #0x30 // mov $0x30,%r8d
bl _vpaes_schedule_core
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
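//
// Editor's note: a minimal usage sketch, assuming the usual
// AES_set_encrypt_key-compatible prototype declared by callers (the
// return type and value are an assumption, not part of this file):
//
//     int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//                               AES_KEY *key);
//     vpaes_set_encrypt_key(raw_key, 128, &enc_key);
//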
.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,%function
vpaes_set_decrypt_key:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
stp d8,d9,[sp,#-16]! // ABI spec says so
lsr w9, w1, #5 // shr $5,%eax
add w9, w9, #5 // $5,%eax
str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
lsl w9, w9, #4 // shl $4,%eax
add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
mov w3, #1 // mov $1,%ecx
lsr w8, w1, #1 // shr $1,%r8d
and x8, x8, #32 // and $32,%r8d
eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
bl _vpaes_schedule_core
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,%function
cmp w5, #0 // check direction
b.eq vpaes_cbc_decrypt
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
mov x17, x2 // reassign
mov x2, x3 // reassign
ld1 {v0.16b}, [x4] // load ivec
bl _vpaes_encrypt_preheat
ld1 {v7.16b}, [x0],#16 // load input
eor v7.16b, v7.16b, v0.16b // xor with ivec
bl _vpaes_encrypt_core
st1 {v0.16b}, [x1],#16 // save output
st1 {v0.16b}, [x4] // write ivec
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
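//
// Editor's note: usage sketch, assuming the AES_cbc_encrypt-compatible
// prototype declared by callers; the last argument is the direction
// tested by "cmp w5, #0" above:
//
//     void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                            size_t length, const AES_KEY *key,
//                            unsigned char *ivec, int enc);
//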
.type vpaes_cbc_decrypt,%function
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
mov x17, x2 // reassign
mov x2, x3 // reassign
ld1 {v6.16b}, [x4] // load ivec
bl _vpaes_decrypt_preheat
b.eq .Lcbc_dec_loop2x
ld1 {v7.16b}, [x0], #16 // load input
bl _vpaes_decrypt_core
eor v0.16b, v0.16b, v6.16b // xor with ivec
orr v6.16b, v7.16b, v7.16b // next ivec value
st1 {v0.16b}, [x1], #16
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_decrypt_2x
eor v0.16b, v0.16b, v6.16b // xor with ivec
eor v1.16b, v1.16b, v14.16b
orr v6.16b, v15.16b, v15.16b
st1 {v0.16b,v1.16b}, [x1], #32
b.hi .Lcbc_dec_loop2x
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl vpaes_ecb_encrypt
.type vpaes_ecb_encrypt,%function
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
bl _vpaes_encrypt_preheat
ld1 {v7.16b}, [x0],#16
bl _vpaes_encrypt_core
st1 {v0.16b}, [x1],#16
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_encrypt_2x
st1 {v0.16b,v1.16b}, [x1], #32
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl vpaes_ecb_decrypt
.type vpaes_ecb_decrypt,%function
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
bl _vpaes_decrypt_preheat
ld1 {v7.16b}, [x0],#16
bl _vpaes_decrypt_core
st1 {v0.16b}, [x1],#16
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_decrypt_2x
st1 {v0.16b,v1.16b}, [x1], #32
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp x29,x30,[sp],#16
.inst 0xd50323bf // autiasp
.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt