/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
// Key schedule constants
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
	.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.size	_vpaes_consts,.-_vpaes_consts
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
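## On this AArch64 port the constants are preloaded into SIMD registers
## instead: v17 is expected to hold the 0x0F nibble mask, and the ld1 loads
## below fill v18-v19 (.Lk_inv), v20-v23 (.Lk_ipt, .Lk_sbo) and
## v24-v27 (.Lk_sb1, .Lk_sb2) from the table pointer in x10.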
.type	_vpaes_encrypt_preheat,%function
_vpaes_encrypt_preheat:
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// .Lk_sb1, .Lk_sb2
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
## AES-encrypt %xmm0.
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
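## AArch64 mapping used below: the block arrives in v7 and the result is
## returned in v0; x2 points at the scheduled key, with the round count at
## [x2,#240]; successive round keys are loaded into v16, and the tables
## preloaded by _vpaes_encrypt_preheat live in v17-v27.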
.type	_vpaes_encrypt_core,%function
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
	// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	// middle of middle round
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and $0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	// middle of last round
	// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
.type	vpaes_encrypt,%function
	stp	x29,x30,[sp,#-16]!
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
.size	vpaes_encrypt,.-vpaes_encrypt
.type	_vpaes_encrypt_2x,%function
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
	// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
	// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	// middle of middle round
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and $0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop
	// middle of last round
	// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb %xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
.type	_vpaes_decrypt_preheat,%function
_vpaes_decrypt_preheat:
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]	// .Lk_dsbb, .Lk_dsbe
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
## Same API as encryption core.
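## The decryption tables are expected as loaded by _vpaes_decrypt_preheat:
## .Lk_inv in v18-v19, .Lk_dipt/.Lk_dsbo in v20-v23, .Lk_dsb9/.Lk_dsbd in
## v24-v27 and .Lk_dsbb/.Lk_dsbe in v28-v31, with the 0x0F mask in v17.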
.type	_vpaes_decrypt_core,%function
	ldr	w8, [x2,#240]			// pull rounds
	// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov %rax, %r11; shl $4, %r11
	eor	x11, x11, #0x30			// xor $0x30, %r11
	and	x11, x11, #0x30			// and $0x30, %r11
	adr	x10, .Lk_mc_forward+48
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	// Inverse mix columns
	// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
	// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt
	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt
	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet
	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	sub	w8, w8, #1			// sub $1,%rax			# nr--
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm0
	// middle of last round
	// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
.type	vpaes_decrypt,%function
	stp	x29,x30,[sp,#-16]!
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
.size	vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
	ldr	w8, [x2,#240]			// pull rounds
	// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov %rax, %r11; shl $4, %r11
	eor	x11, x11, #0x30			// xor $0x30, %r11
	and	x11, x11, #0x30			// and $0x30, %r11
	adr	x10, .Lk_mc_forward+48
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {v20.16b},v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b},v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	tbl	v8.16b, {v21.16b},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	// Inverse mix columns
	// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	tbl	v9.16b, {v25.16b}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
	// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt
	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	tbl	v9.16b, {v27.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt
	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	tbl	v9.16b, {v29.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet
	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	tbl	v9.16b, {v31.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub $1,%rax			# nr--
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {v19.16b},v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop
	// middle of last round
	// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	tbl	v9.16b, {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb %xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
########################################################
##          AES key schedule                          ##
########################################################
.type	_vpaes_key_preheat,%function
	movi	v16.16b, #0x5b			// .Lk_s63
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
.size	_vpaes_key_preheat,.-_vpaes_key_preheat
.type	_vpaes_schedule_core,%function
_vpaes_schedule_core:
	stp	x29, x30, [sp,#-16]!
	bl	_vpaes_key_preheat		// load the tables
	ld1	{v0.16b}, [x0],#16		// vmovdqu (%rdi), %xmm0	# load key (unaligned)
	mov	v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7
	adr	x10, .Lk_sr			// lea .Lk_sr(%rip),%r10
	cbnz	w3, .Lschedule_am_decrypting
	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu %xmm0, (%rdx)
.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [x2]			// vmovdqu %xmm3, (%rdx)
	eor	x8, x8, #0x30			// xor $0x30, %r8
	cmp	w1, #192			// cmp $192, %esi
## 128-bit specific part of key schedule.
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
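## Here x0 is the loop counter: _vpaes_schedule_round runs ten times, the
## first nine results are written out by _vpaes_schedule_mangle and the
## tenth by .Lschedule_mangle_last, which together with the round-0 key
## stored above gives the 11 round keys of AES-128.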
	mov	x0, #10				// mov $10, %esi
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
## 192-bit specific part of key schedule.
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
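## Each of the four cycles below therefore writes three more round keys
## (the last of the twelve via .Lschedule_mangle_last), which together with
## the round-0 key stored above gives the 13 round keys of AES-192.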
	ld1	{v0.16b}, [x0]			// vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps %xmm4, %xmm6, %xmm6	# clobber low side with zeros
	mov	x0, #4				// mov $4, %esi
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr $8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
## 256-bit specific part of key schedule.
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
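## With x0 counting down from 7, the first six cycles below each write two
## round keys (one low side, one high side); the seventh writes one and then
## falls through to .Lschedule_mangle_last, so with the round-0 key stored
## above this yields the 15 round keys of AES-256.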
	ld1	{v0.16b}, [x0]			// vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7				// mov $7, %esi
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save cur_lo in xmm6
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle
	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd $0xFF, %xmm0, %xmm0
	mov	v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7
## .aes_schedule_mangle_last
## Mangler for last round of key schedule
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
## Always called right before return... jumps to cleanup and exits
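## In the code below the encrypting path applies the output permute loaded
## from [x8] and switches x11 to .Lk_opt, while the decrypting path keeps
## .Lk_deskew; both then xor in .Lk_s63 (v16), run _vpaes_schedule_transform
## with the constants reloaded from [x11], and store the final round key.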
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea .Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	w3, .Lschedule_mangle_last_dec
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea .Lk_opt(%rip), %r11	# prepare to output transform
	add	x2, x2, #32			// add $32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0	# output permute
.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add $-16, %rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu %xmm0, (%rdx)	# save last key
	eor	v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
## .aes_schedule_192_smear
## Smear the short, low side in the 192-bit key schedule.
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
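## The ins instructions below realize the vpshufd shuffles noted in the x86
## comments; the xors then accumulate the smear into v6 and v0, and the
## final ins clears the low 64 bits of v6 again.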
.type	_vpaes_schedule_192_smear,%function
_vpaes_schedule_192_smear:
	ins	v1.s[3], v6.s[2]		// vpshufd $0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]		// vpshufd $0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b		// vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b		// vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b			// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]		// vmovhlps %xmm1, %xmm6, %xmm6	# clobber low side with zeros
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
## .aes_schedule_round
## Runs one main round of the key schedule on %xmm0, %xmm7
## Specifically, runs subbytes on the high dword of %xmm0,
## then rotates it by one byte and xors into the low dword of %xmm7.
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for the next rcon.
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
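## In this port the rcon schedule lives in v8 (loaded from .Lk_rcon by
## _vpaes_key_preheat) and is stepped with ext; the values called %xmm0 and
## %xmm7 above are kept in v0 and v7, and the _vpaes_schedule_low_round
## entry point below skips the rcon/rotation prologue.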
.type	_vpaes_schedule_round,%function
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr $15, %xmm8, %xmm4, %xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr $15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
	dup	v0.4s, v0.s[3]			// vpshufd $0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr $1, %xmm0, %xmm0, %xmm0
	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq $4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq $8, %xmm7, %xmm4
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0	# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3	# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output
	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
## .aes_schedule_transform
## Linear-transform %xmm0 according to tables at (%r11)
## Requires that %xmm9 = 0x0F0F... as in preheat
## Clobbers %xmm1, %xmm2
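## Here the lo/hi tables expected at (%r11) are taken from v20/v21 (loaded
## by the preheat routines, or reloaded at .Lschedule_mangle_last_dec), the
## 0x0F mask from v17, and the transformed value is returned in v0.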
.type	_vpaes_schedule_transform,%function
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	// vmovdqa	(%r11), %xmm2		# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	// vmovdqa	16(%r11), %xmm1		# hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
## .aes_schedule_mangle
## Mangle xmm0 from (basis-transformed) standard version
## On encrypt:
##   multiply by circulant 0,1,1,1
##   apply shiftrows transform
## On decrypt:
##   multiply by "inverse mixcolumns" circulant E,B,D,9
##   apply shiftrows transform
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Clobbers xmm1-xmm5
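## The encrypting path below xors in .Lk_s63 (v16) and multiplies by the
## 0,1,1,1 circulant with three tbl shuffles through .Lk_mc_forward[0] (v9);
## the decrypting path instead runs the nibbles through the
## .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9 tables in v24-v31. Both finish with
## the shiftrows permute loaded from [x8] before storing to [x2].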
.type	_vpaes_schedule_mangle,%function
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4		# save xmm0 for later
	// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec
	eor	v4.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	x2, x2, #16			// add $16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3
	b	.Lschedule_mangle_both
.Lschedule_mangle_dec:
	// inverse mix columns
	// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb $4, %xmm4, %xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand %xmm9, %xmm4, %xmm4	# 4 = lo
	// vmovdqa	0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	// vmovdqa	0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
	// vmovdqa	0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa	0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
	// vmovdqa	0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa	0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	// vmovdqa	0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
	// vmovdqa	0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm3
	sub	x2, x2, #16			// add $-16, %rdx
.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16			// add $-16, %r8
	and	x8, x8, #~(1<<6)		// and $0x30, %r8
	st1	{v3.2d}, [x2]			// vmovdqu %xmm3, (%rdx)
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
vpaes_set_encrypt_key:
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	lsr	w9, w1, #5			// shr $5,%eax
	add	w9, w9, #5			// add $5,%eax
	str	w9, [x2,#240]			// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	mov	w3, #0				// mov $0,%ecx
	mov	x8, #0x30			// mov $0x30,%r8d
	bl	_vpaes_schedule_core
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
vpaes_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	lsr	w9, w1, #5			// shr $5,%eax
	add	w9, w9, #5			// add $5,%eax
	str	w9, [x2,#240]			// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4			// shl $4,%eax
	add	x2, x2, #16			// lea 16(%rdx,%rax),%rdx
	mov	w3, #1				// mov $1,%ecx
	lsr	w8, w1, #1			// shr $1,%r8d
	and	x8, x8, #32			// and $32,%r8d
	eor	x8, x8, #32			// xor $32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core
	ldp	x29,x30,[sp],#16
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
	cmp	w5, #0				// check direction
	b.eq	vpaes_cbc_decrypt
	stp	x29,x30,[sp,#-16]!
	mov	x17, x2				// reassign
	mov	x2, x3				// reassign
	ld1	{v0.16b}, [x4]			// load ivec
	bl	_vpaes_encrypt_preheat
	ld1	{v7.16b}, [x0],#16		// load input
	eor	v7.16b, v7.16b, v0.16b		// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16		// save output
	st1	{v0.16b}, [x4]			// write ivec
	ldp	x29,x30,[sp],#16
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
.type	vpaes_cbc_decrypt,%function
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!
	mov	x17, x2				// reassign
	mov	x2, x3				// reassign
	ld1	{v6.16b}, [x4]			// load ivec
	bl	_vpaes_decrypt_preheat
	b.eq	.Lcbc_dec_loop2x
	ld1	{v7.16b}, [x0], #16		// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b		// xor with ivec
	orr	v6.16b, v7.16b, v7.16b		// next ivec value
	st1	{v0.16b}, [x1], #16
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b		// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	b.hi	.Lcbc_dec_loop2x
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	x29,x30,[sp],#16
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!
	bl	_vpaes_encrypt_preheat
	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	x29,x30,[sp],#16
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!
	bl	_vpaes_decrypt_preheat
	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1],#16
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	x29,x30,[sp],#16
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt