1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
3 .text
4
5 .type   _vpaes_consts,%object
6 .align  7       // totally strategic alignment
7 _vpaes_consts:
8 .Lk_mc_forward: //      mc_forward
9 .quad   0x0407060500030201, 0x0C0F0E0D080B0A09
10 .quad   0x080B0A0904070605, 0x000302010C0F0E0D
11 .quad   0x0C0F0E0D080B0A09, 0x0407060500030201
12 .quad   0x000302010C0F0E0D, 0x080B0A0904070605
13 .Lk_mc_backward:        //      mc_backward
14 .quad   0x0605040702010003, 0x0E0D0C0F0A09080B
15 .quad   0x020100030E0D0C0F, 0x0A09080B06050407
16 .quad   0x0E0D0C0F0A09080B, 0x0605040702010003
17 .quad   0x0A09080B06050407, 0x020100030E0D0C0F
18 .Lk_sr: //      sr
19 .quad   0x0706050403020100, 0x0F0E0D0C0B0A0908
20 .quad   0x030E09040F0A0500, 0x0B06010C07020D08
21 .quad   0x0F060D040B020900, 0x070E050C030A0108
22 .quad   0x0B0E0104070A0D00, 0x0306090C0F020508
23
24 //
25 // "Hot" constants
26 //
27 .Lk_inv:        //      inv, inva
28 .quad   0x0E05060F0D080180, 0x040703090A0B0C02
29 .quad   0x01040A060F0B0780, 0x030D0E0C02050809
30 .Lk_ipt:        //      input transform (lo, hi)
31 .quad   0xC2B2E8985A2A7000, 0xCABAE09052227808
32 .quad   0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
33 .Lk_sbo:        //      sbou, sbot
34 .quad   0xD0D26D176FBDC700, 0x15AABF7AC502A878
35 .quad   0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
36 .Lk_sb1:        //      sb1u, sb1t
37 .quad   0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
38 .quad   0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
39 .Lk_sb2:        //      sb2u, sb2t
40 .quad   0x69EB88400AE12900, 0xC2A163C8AB82234A
41 .quad   0xE27A93C60B712400, 0x5EB7E955BC982FCD
42
43 //
44 //  Decryption stuff
45 //
46 .Lk_dipt:       //      decryption input transform
47 .quad   0x0F505B040B545F00, 0x154A411E114E451A
48 .quad   0x86E383E660056500, 0x12771772F491F194
49 .Lk_dsbo:       //      decryption sbox final output
50 .quad   0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
51 .quad   0x12D7560F93441D00, 0xCA4B8159D8C58E9C
52 .Lk_dsb9:       //      decryption sbox output *9*u, *9*t
53 .quad   0x851C03539A86D600, 0xCAD51F504F994CC9
54 .quad   0xC03B1789ECD74900, 0x725E2C9EB2FBA565
55 .Lk_dsbd:       //      decryption sbox output *D*u, *D*t
56 .quad   0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
57 .quad   0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
58 .Lk_dsbb:       //      decryption sbox output *B*u, *B*t
59 .quad   0xD022649296B44200, 0x602646F6B0F2D404
60 .quad   0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
61 .Lk_dsbe:       //      decryption sbox output *E*u, *E*t
62 .quad   0x46F2929626D4D000, 0x2242600464B4F6B0
63 .quad   0x0C55A6CDFFAAC100, 0x9467F36B98593E32
64
65 //
66 //  Key schedule constants
67 //
68 .Lk_dksd:       //      decryption key schedule: invskew x*D
69 .quad   0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
70 .quad   0x41C277F4B5368300, 0x5FDC69EAAB289D1E
71 .Lk_dksb:       //      decryption key schedule: invskew x*B
72 .quad   0x9A4FCA1F8550D500, 0x03D653861CC94C99
73 .quad   0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
74 .Lk_dkse:       //      decryption key schedule: invskew x*E + 0x63
75 .quad   0xD5031CCA1FC9D600, 0x53859A4C994F5086
76 .quad   0xA23196054FDC7BE8, 0xCD5EF96A20B31487
77 .Lk_dks9:       //      decryption key schedule: invskew x*9
78 .quad   0xB6116FC87ED9A700, 0x4AED933482255BFC
79 .quad   0x4576516227143300, 0x8BB89FACE9DAFDCE
80
81 .Lk_rcon:       //      rcon
82 .quad   0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
83
84 .Lk_opt:        //      output transform
85 .quad   0xFF9F4929D6B66000, 0xF7974121DEBE6808
86 .quad   0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
87 .Lk_deskew:     //      deskew tables: inverts the sbox's "skew"
88 .quad   0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
89 .quad   0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
90
91 .byte   86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
92 .align  2
93 .size   _vpaes_consts,.-_vpaes_consts
94 .align  6
95 ##
96 ##  _aes_preheat
97 ##
98 ##  Fills register %r10 -> .aes_consts (so you can -fPIC)
99 ##  and %xmm9-%xmm15 as specified below.
100 ##
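##  In this AArch64 translation the constants pointer is x10 rather than %r10,
##  and the tables land in NEON registers: v17 = 0x0F mask, v18-v19 = .Lk_inv,
##  v20-v23 = .Lk_ipt/.Lk_sbo, v24-v27 = .Lk_sb1/.Lk_sb2.
##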
101 .type   _vpaes_encrypt_preheat,%function
102 .align  4
103 _vpaes_encrypt_preheat:
104         adr     x10, .Lk_inv
105         movi    v17.16b, #0x0f
106         ld1     {v18.2d,v19.2d}, [x10],#32      // .Lk_inv
107         ld1     {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64        // .Lk_ipt, .Lk_sbo
108         ld1     {v24.2d,v25.2d,v26.2d,v27.2d}, [x10]            // .Lk_sb1, .Lk_sb2
109         ret
110 .size   _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
111
112 ##
113 ##  _aes_encrypt_core
114 ##
115 ##  AES-encrypt %xmm0.
116 ##
117 ##  Inputs:
118 ##     %xmm0 = input
119 ##     %xmm9-%xmm15 as in _vpaes_preheat
120 ##    (%rdx) = scheduled keys
121 ##
122 ##  Output in %xmm0
123 ##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
124 ##  Preserves %xmm6 - %xmm8 so you get some local vectors
125 ##
126 ##
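##  AArch64 mapping: the block arrives in v7, the key schedule pointer in x2
##  (round count at [x2,#240]), and the result is returned in v0.  The core
##  clobbers v1-v5, v16 and w8/x9-x11; the preheated tables in v17-v27 are
##  only read.
##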
127 .type   _vpaes_encrypt_core,%function
128 .align  4
129 _vpaes_encrypt_core:
130         mov     x9, x2
131         ldr     w8, [x2,#240]                   // pull rounds
132         adr     x11, .Lk_mc_forward+16
133                                                 // vmovdqa      .Lk_ipt(%rip),  %xmm2   # iptlo
134         ld1     {v16.2d}, [x9], #16             // vmovdqu      (%r9),  %xmm5           # round0 key
135         and     v1.16b, v7.16b, v17.16b         // vpand        %xmm9,  %xmm0,  %xmm1
136         ushr    v0.16b, v7.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0
137         tbl     v1.16b, {v20.16b}, v1.16b       // vpshufb      %xmm1,  %xmm2,  %xmm1
138                                                 // vmovdqa      .Lk_ipt+16(%rip), %xmm3 # ipthi
139         tbl     v2.16b, {v21.16b}, v0.16b       // vpshufb      %xmm0,  %xmm3,  %xmm2
140         eor     v0.16b, v1.16b, v16.16b         // vpxor        %xmm5,  %xmm1,  %xmm0
141         eor     v0.16b, v0.16b, v2.16b          // vpxor        %xmm2,  %xmm0,  %xmm0
142         b       .Lenc_entry
143
144 .align  4
145 .Lenc_loop:
146         // middle of middle round
147         add     x10, x11, #0x40
148         tbl     v4.16b, {v25.16b}, v2.16b               // vpshufb      %xmm2,  %xmm13, %xmm4   # 4 = sb1u
149         ld1     {v1.2d}, [x11], #16             // vmovdqa      -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
150         tbl     v0.16b, {v24.16b}, v3.16b               // vpshufb      %xmm3,  %xmm12, %xmm0   # 0 = sb1t
151         eor     v4.16b, v4.16b, v16.16b         // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = sb1u + k
152         tbl     v5.16b, {v27.16b}, v2.16b               // vpshufb      %xmm2,  %xmm15, %xmm5   # 4 = sb2u
153         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0,  %xmm0   # 0 = A
154         tbl     v2.16b, {v26.16b}, v3.16b               // vpshufb      %xmm3,  %xmm14, %xmm2   # 2 = sb2t
155         ld1     {v4.2d}, [x10]                  // vmovdqa      (%r11,%r10), %xmm4      # .Lk_mc_backward[]
156         tbl     v3.16b, {v0.16b}, v1.16b        // vpshufb      %xmm1,  %xmm0,  %xmm3   # 0 = B
157         eor     v2.16b, v2.16b, v5.16b          // vpxor        %xmm5,  %xmm2,  %xmm2   # 2 = 2A
158         tbl     v0.16b, {v0.16b}, v4.16b        // vpshufb      %xmm4,  %xmm0,  %xmm0   # 3 = D
159         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3   # 0 = 2A+B
160         tbl     v4.16b, {v3.16b}, v1.16b        // vpshufb      %xmm1,  %xmm3,  %xmm4   # 0 = 2B+C
161         eor     v0.16b, v0.16b, v3.16b          // vpxor        %xmm3,  %xmm0,  %xmm0   # 3 = 2A+B+D
162         and     x11, x11, #~(1<<6)              // and          $0x30,  %r11            # ... mod 4
163         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0, %xmm0    # 0 = 2A+3B+C+D
164         sub     w8, w8, #1                      // nr--
165
166 .Lenc_entry:
167         // top of round
168         and     v1.16b, v0.16b, v17.16b         // vpand        %xmm0,  %xmm9,  %xmm1   # 0 = k
169         ushr    v0.16b, v0.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0   # 1 = i
170         tbl     v5.16b, {v19.16b}, v1.16b       // vpshufb      %xmm1,  %xmm11, %xmm5   # 2 = a/k
171         eor     v1.16b, v1.16b, v0.16b          // vpxor        %xmm0,  %xmm1,  %xmm1   # 0 = j
172         tbl     v3.16b, {v18.16b}, v0.16b       // vpshufb      %xmm0,  %xmm10, %xmm3   # 3 = 1/i
173         tbl     v4.16b, {v18.16b}, v1.16b       // vpshufb      %xmm1,  %xmm10, %xmm4   # 4 = 1/j
174         eor     v3.16b, v3.16b, v5.16b          // vpxor        %xmm5,  %xmm3,  %xmm3   # 3 = iak = 1/i + a/k
175         eor     v4.16b, v4.16b, v5.16b          // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = jak = 1/j + a/k
176         tbl     v2.16b, {v18.16b}, v3.16b       // vpshufb      %xmm3,  %xmm10, %xmm2   # 2 = 1/iak
177         tbl     v3.16b, {v18.16b}, v4.16b       // vpshufb      %xmm4,  %xmm10, %xmm3   # 3 = 1/jak
178         eor     v2.16b, v2.16b, v1.16b          // vpxor        %xmm1,  %xmm2,  %xmm2   # 2 = io
179         eor     v3.16b, v3.16b, v0.16b          // vpxor        %xmm0,  %xmm3,  %xmm3   # 3 = jo
180         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm5
181         cbnz    w8, .Lenc_loop
182
183         // middle of last round
184         add     x10, x11, #0x80
185                                                 // vmovdqa      -0x60(%r10), %xmm4      # 3 : sbou      .Lk_sbo
186                                                 // vmovdqa      -0x50(%r10), %xmm0      # 0 : sbot      .Lk_sbo+16
187         tbl     v4.16b, {v22.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4   # 4 = sbou
188         ld1     {v1.2d}, [x10]                  // vmovdqa      0x40(%r11,%r10), %xmm1  # .Lk_sr[]
189         tbl     v0.16b, {v23.16b}, v3.16b               // vpshufb      %xmm3,  %xmm0,  %xmm0   # 0 = sb1t
190         eor     v4.16b, v4.16b, v16.16b         // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = sb1u + k
191         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0,  %xmm0   # 0 = A
192         tbl     v0.16b, {v0.16b}, v1.16b        // vpshufb      %xmm1,  %xmm0,  %xmm0
193         ret
194 .size   _vpaes_encrypt_core,.-_vpaes_encrypt_core
195
196 .globl  vpaes_encrypt
197 .type   vpaes_encrypt,%function
198 .align  4
199 vpaes_encrypt:
200         stp     x29,x30,[sp,#-16]!
201         add     x29,sp,#0
202
203         ld1     {v7.16b}, [x0]
204         bl      _vpaes_encrypt_preheat
205         bl      _vpaes_encrypt_core
206         st1     {v0.16b}, [x1]
207
208         ldp     x29,x30,[sp],#16
209         ret
210 .size   vpaes_encrypt,.-vpaes_encrypt
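##
##  Illustrative C sketch (an editorial example, not produced by
##  vpaes-armv8.pl): it assumes the prototypes OpenSSL's C callers declare
##  for these routines, with AES_KEY taken from <openssl/aes.h>.
##
##      #include <openssl/aes.h>
##
##      int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##                                 AES_KEY *key);
##      void vpaes_encrypt(const unsigned char *in, unsigned char *out,
##                         const AES_KEY *key);
##
##      /* Encrypt one 16-byte block with AES-128. */
##      static void one_block(const unsigned char key[16],
##                            const unsigned char in[16],
##                            unsigned char out[16])
##      {
##          AES_KEY ks;
##          vpaes_set_encrypt_key(key, 128, &ks);   /* also sets ks.rounds   */
##          vpaes_encrypt(in, out, &ks);            /* x0=in, x1=out, x2=&ks */
##      }
##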
211
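// v14-v15 input, v0-v1 output (two blocks in parallel, same key schedule in x2)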
212 .type   _vpaes_encrypt_2x,%function
213 .align  4
214 _vpaes_encrypt_2x:
215         mov     x9, x2
216         ldr     w8, [x2,#240]                   // pull rounds
217         adr     x11, .Lk_mc_forward+16
218                                                 // vmovdqa      .Lk_ipt(%rip),  %xmm2   # iptlo
219         ld1     {v16.2d}, [x9], #16             // vmovdqu      (%r9),  %xmm5           # round0 key
220         and     v1.16b,  v14.16b,  v17.16b      // vpand        %xmm9,  %xmm0,  %xmm1
221         ushr    v0.16b,  v14.16b,  #4           // vpsrlb       $4,     %xmm0,  %xmm0
222         and     v9.16b,  v15.16b,  v17.16b
223         ushr    v8.16b,  v15.16b,  #4
224         tbl     v1.16b,  {v20.16b}, v1.16b      // vpshufb      %xmm1,  %xmm2,  %xmm1
225         tbl     v9.16b,  {v20.16b}, v9.16b
226                                                 // vmovdqa      .Lk_ipt+16(%rip), %xmm3 # ipthi
227         tbl     v2.16b,  {v21.16b}, v0.16b      // vpshufb      %xmm0,  %xmm3,  %xmm2
228         tbl     v10.16b, {v21.16b}, v8.16b
229         eor     v0.16b,  v1.16b,   v16.16b      // vpxor        %xmm5,  %xmm1,  %xmm0
230         eor     v8.16b,  v9.16b,   v16.16b
231         eor     v0.16b,  v0.16b,   v2.16b       // vpxor        %xmm2,  %xmm0,  %xmm0
232         eor     v8.16b,  v8.16b,   v10.16b
233         b       .Lenc_2x_entry
234
235 .align  4
236 .Lenc_2x_loop:
237         // middle of middle round
238         add     x10, x11, #0x40
239         tbl     v4.16b,  {v25.16b}, v2.16b      // vpshufb      %xmm2,  %xmm13, %xmm4   # 4 = sb1u
240         tbl     v12.16b, {v25.16b}, v10.16b
241         ld1     {v1.2d}, [x11], #16             // vmovdqa      -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
242         tbl     v0.16b,  {v24.16b}, v3.16b      // vpshufb      %xmm3,  %xmm12, %xmm0   # 0 = sb1t
243         tbl     v8.16b,  {v24.16b}, v11.16b
244         eor     v4.16b,  v4.16b,  v16.16b       // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = sb1u + k
245         eor     v12.16b, v12.16b, v16.16b
246         tbl     v5.16b,  {v27.16b}, v2.16b      // vpshufb      %xmm2,  %xmm15, %xmm5   # 4 = sb2u
247         tbl     v13.16b, {v27.16b}, v10.16b
248         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0,  %xmm0   # 0 = A
249         eor     v8.16b,  v8.16b,  v12.16b
250         tbl     v2.16b,  {v26.16b}, v3.16b      // vpshufb      %xmm3,  %xmm14, %xmm2   # 2 = sb2t
251         tbl     v10.16b, {v26.16b}, v11.16b
252         ld1     {v4.2d}, [x10]                  // vmovdqa      (%r11,%r10), %xmm4      # .Lk_mc_backward[]
253         tbl     v3.16b,  {v0.16b}, v1.16b       // vpshufb      %xmm1,  %xmm0,  %xmm3   # 0 = B
254         tbl     v11.16b, {v8.16b}, v1.16b
255         eor     v2.16b,  v2.16b,  v5.16b        // vpxor        %xmm5,  %xmm2,  %xmm2   # 2 = 2A
256         eor     v10.16b, v10.16b, v13.16b
257         tbl     v0.16b,  {v0.16b}, v4.16b       // vpshufb      %xmm4,  %xmm0,  %xmm0   # 3 = D
258         tbl     v8.16b,  {v8.16b}, v4.16b
259         eor     v3.16b,  v3.16b,  v2.16b        // vpxor        %xmm2,  %xmm3,  %xmm3   # 0 = 2A+B
260         eor     v11.16b, v11.16b, v10.16b
261         tbl     v4.16b,  {v3.16b}, v1.16b       // vpshufb      %xmm1,  %xmm3,  %xmm4   # 0 = 2B+C
262         tbl     v12.16b, {v11.16b},v1.16b
263         eor     v0.16b,  v0.16b,  v3.16b        // vpxor        %xmm3,  %xmm0,  %xmm0   # 3 = 2A+B+D
264         eor     v8.16b,  v8.16b,  v11.16b
265         and     x11, x11, #~(1<<6)              // and          $0x30,  %r11            # ... mod 4
266         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0, %xmm0    # 0 = 2A+3B+C+D
267         eor     v8.16b,  v8.16b,  v12.16b
268         sub     w8, w8, #1                      // nr--
269
270 .Lenc_2x_entry:
271         // top of round
272         and     v1.16b,  v0.16b, v17.16b        // vpand        %xmm0,  %xmm9,  %xmm1   # 0 = k
273         ushr    v0.16b,  v0.16b, #4             // vpsrlb       $4,     %xmm0,  %xmm0   # 1 = i
274         and     v9.16b,  v8.16b, v17.16b
275         ushr    v8.16b,  v8.16b, #4
276         tbl     v5.16b,  {v19.16b},v1.16b       // vpshufb      %xmm1,  %xmm11, %xmm5   # 2 = a/k
277         tbl     v13.16b, {v19.16b},v9.16b
278         eor     v1.16b,  v1.16b,  v0.16b        // vpxor        %xmm0,  %xmm1,  %xmm1   # 0 = j
279         eor     v9.16b,  v9.16b,  v8.16b
280         tbl     v3.16b,  {v18.16b},v0.16b       // vpshufb      %xmm0,  %xmm10, %xmm3   # 3 = 1/i
281         tbl     v11.16b, {v18.16b},v8.16b
282         tbl     v4.16b,  {v18.16b},v1.16b       // vpshufb      %xmm1,  %xmm10, %xmm4   # 4 = 1/j
283         tbl     v12.16b, {v18.16b},v9.16b
284         eor     v3.16b,  v3.16b,  v5.16b        // vpxor        %xmm5,  %xmm3,  %xmm3   # 3 = iak = 1/i + a/k
285         eor     v11.16b, v11.16b, v13.16b
286         eor     v4.16b,  v4.16b,  v5.16b        // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = jak = 1/j + a/k
287         eor     v12.16b, v12.16b, v13.16b
288         tbl     v2.16b,  {v18.16b},v3.16b       // vpshufb      %xmm3,  %xmm10, %xmm2   # 2 = 1/iak
289         tbl     v10.16b, {v18.16b},v11.16b
290         tbl     v3.16b,  {v18.16b},v4.16b       // vpshufb      %xmm4,  %xmm10, %xmm3   # 3 = 1/jak
291         tbl     v11.16b, {v18.16b},v12.16b
292         eor     v2.16b,  v2.16b,  v1.16b        // vpxor        %xmm1,  %xmm2,  %xmm2   # 2 = io
293         eor     v10.16b, v10.16b, v9.16b
294         eor     v3.16b,  v3.16b,  v0.16b        // vpxor        %xmm0,  %xmm3,  %xmm3   # 3 = jo
295         eor     v11.16b, v11.16b, v8.16b
296         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm5
297         cbnz    w8, .Lenc_2x_loop
298
299         // middle of last round
300         add     x10, x11, #0x80
301                                                 // vmovdqa      -0x60(%r10), %xmm4      # 3 : sbou      .Lk_sbo
302                                                 // vmovdqa      -0x50(%r10), %xmm0      # 0 : sbot      .Lk_sbo+16
303         tbl     v4.16b,  {v22.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4   # 4 = sbou
304         tbl     v12.16b, {v22.16b}, v10.16b
305         ld1     {v1.2d}, [x10]                  // vmovdqa      0x40(%r11,%r10), %xmm1  # .Lk_sr[]
306         tbl     v0.16b,  {v23.16b}, v3.16b      // vpshufb      %xmm3,  %xmm0,  %xmm0   # 0 = sb1t
307         tbl     v8.16b,  {v23.16b}, v11.16b
308         eor     v4.16b,  v4.16b,  v16.16b       // vpxor        %xmm5,  %xmm4,  %xmm4   # 4 = sb1u + k
309         eor     v12.16b, v12.16b, v16.16b
310         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0,  %xmm0   # 0 = A
311         eor     v8.16b,  v8.16b,  v12.16b
312         tbl     v0.16b,  {v0.16b},v1.16b        // vpshufb      %xmm1,  %xmm0,  %xmm0
313         tbl     v1.16b,  {v8.16b},v1.16b
314         ret
315 .size   _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
316
317 .type   _vpaes_decrypt_preheat,%function
318 .align  4
319 _vpaes_decrypt_preheat:
320         adr     x10, .Lk_inv
321         movi    v17.16b, #0x0f
322         adr     x11, .Lk_dipt
323         ld1     {v18.2d,v19.2d}, [x10],#32      // .Lk_inv
324         ld1     {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64        // .Lk_dipt, .Lk_dsbo
325         ld1     {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64        // .Lk_dsb9, .Lk_dsbd
326         ld1     {v28.2d,v29.2d,v30.2d,v31.2d}, [x11]            // .Lk_dsbb, .Lk_dsbe
327         ret
328 .size   _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
329
330 ##
331 ##  Decryption core
332 ##
333 ##  Same API as encryption core.
334 ##
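##  As with the encrypt core, the block is taken from v7 and returned in v0;
##  the tables loaded by _vpaes_decrypt_preheat live in v17-v31, and x11 is
##  pointed at the .Lk_sr entry selected from the round count.
##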
335 .type   _vpaes_decrypt_core,%function
336 .align  4
337 _vpaes_decrypt_core:
338         mov     x9, x2
339         ldr     w8, [x2,#240]                   // pull rounds
340
341                                                 // vmovdqa      .Lk_dipt(%rip), %xmm2   # iptlo
342         lsl     x11, x8, #4                     // mov  %rax,   %r11;   shl     $4, %r11
343         eor     x11, x11, #0x30                 // xor          $0x30,  %r11
344         adr     x10, .Lk_sr
345         and     x11, x11, #0x30                 // and          $0x30,  %r11
346         add     x11, x11, x10
347         adr     x10, .Lk_mc_forward+48
348
349         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm4           # round0 key
350         and     v1.16b, v7.16b, v17.16b         // vpand        %xmm9,  %xmm0,  %xmm1
351         ushr    v0.16b, v7.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0
352         tbl     v2.16b, {v20.16b}, v1.16b       // vpshufb      %xmm1,  %xmm2,  %xmm2
353         ld1     {v5.2d}, [x10]                  // vmovdqa      .Lk_mc_forward+48(%rip), %xmm5
354                                                 // vmovdqa      .Lk_dipt+16(%rip), %xmm1 # ipthi
355         tbl     v0.16b, {v21.16b}, v0.16b       // vpshufb      %xmm0,  %xmm1,  %xmm0
356         eor     v2.16b, v2.16b, v16.16b         // vpxor        %xmm4,  %xmm2,  %xmm2
357         eor     v0.16b, v0.16b, v2.16b          // vpxor        %xmm2,  %xmm0,  %xmm0
358         b       .Ldec_entry
359
360 .align  4
361 .Ldec_loop:
362 //
363 //  Inverse mix columns
364 //
365                                                 // vmovdqa      -0x20(%r10),%xmm4               # 4 : sb9u
366                                                 // vmovdqa      -0x10(%r10),%xmm1               # 0 : sb9t
367         tbl     v4.16b, {v24.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sb9u
368         tbl     v1.16b, {v25.16b}, v3.16b               // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sb9t
369         eor     v0.16b, v4.16b, v16.16b         // vpxor        %xmm4,  %xmm0,  %xmm0
370                                                 // vmovdqa      0x00(%r10),%xmm4                # 4 : sbdu
371         eor     v0.16b, v0.16b, v1.16b          // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
372                                                 // vmovdqa      0x10(%r10),%xmm1                # 0 : sbdt
373
374         tbl     v4.16b, {v26.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbdu
375         tbl     v0.16b, {v0.16b}, v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
376         tbl     v1.16b, {v27.16b}, v3.16b               // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbdt
377         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
378                                                 // vmovdqa      0x20(%r10),     %xmm4           # 4 : sbbu
379         eor     v0.16b, v0.16b, v1.16b          // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
380                                                 // vmovdqa      0x30(%r10),     %xmm1           # 0 : sbbt
381
382         tbl     v4.16b, {v28.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbbu
383         tbl     v0.16b, {v0.16b}, v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
384         tbl     v1.16b, {v29.16b}, v3.16b               // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbbt
385         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
386                                                 // vmovdqa      0x40(%r10),     %xmm4           # 4 : sbeu
387         eor     v0.16b, v0.16b, v1.16b          // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
388                                                 // vmovdqa      0x50(%r10),     %xmm1           # 0 : sbet
389
390         tbl     v4.16b, {v30.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbeu
391         tbl     v0.16b, {v0.16b}, v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
392         tbl     v1.16b, {v31.16b}, v3.16b               // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbet
393         eor     v0.16b, v0.16b, v4.16b          // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
394         ext     v5.16b, v5.16b, v5.16b, #12     // vpalignr $12,        %xmm5,  %xmm5,  %xmm5
395         eor     v0.16b, v0.16b, v1.16b          // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
396         sub     w8, w8, #1                      // sub          $1,%rax                 # nr--
397
398 .Ldec_entry:
399         // top of round
400         and     v1.16b, v0.16b, v17.16b         // vpand        %xmm9,  %xmm0,  %xmm1   # 0 = k
401         ushr    v0.16b, v0.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0   # 1 = i
402         tbl     v2.16b, {v19.16b}, v1.16b       // vpshufb      %xmm1,  %xmm11, %xmm2   # 2 = a/k
403         eor     v1.16b, v1.16b, v0.16b          // vpxor        %xmm0,  %xmm1,  %xmm1   # 0 = j
404         tbl     v3.16b, {v18.16b}, v0.16b       // vpshufb      %xmm0,  %xmm10, %xmm3   # 3 = 1/i
405         tbl     v4.16b, {v18.16b}, v1.16b       // vpshufb      %xmm1,  %xmm10, %xmm4   # 4 = 1/j
406         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3   # 3 = iak = 1/i + a/k
407         eor     v4.16b, v4.16b, v2.16b          // vpxor        %xmm2,  %xmm4,  %xmm4   # 4 = jak = 1/j + a/k
408         tbl     v2.16b, {v18.16b}, v3.16b       // vpshufb      %xmm3,  %xmm10, %xmm2   # 2 = 1/iak
409         tbl     v3.16b, {v18.16b}, v4.16b       // vpshufb      %xmm4,  %xmm10, %xmm3   # 3 = 1/jak
410         eor     v2.16b, v2.16b, v1.16b          // vpxor        %xmm1,  %xmm2,  %xmm2   # 2 = io
411         eor     v3.16b, v3.16b, v0.16b          // vpxor        %xmm0,  %xmm3,  %xmm3   # 3 = jo
412         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm0
413         cbnz    w8, .Ldec_loop
414
415         // middle of last round
416                                                 // vmovdqa      0x60(%r10),     %xmm4   # 3 : sbou
417         tbl     v4.16b, {v22.16b}, v2.16b               // vpshufb      %xmm2,  %xmm4,  %xmm4   # 4 = sbou
418                                                 // vmovdqa      0x70(%r10),     %xmm1   # 0 : sbot
419         ld1     {v2.2d}, [x11]                  // vmovdqa      -0x160(%r11),   %xmm2   # .Lk_sr-.Lk_dsbd=-0x160
420         tbl     v1.16b, {v23.16b}, v3.16b               // vpshufb      %xmm3,  %xmm1,  %xmm1   # 0 = sb1t
421         eor     v4.16b, v4.16b, v16.16b         // vpxor        %xmm0,  %xmm4,  %xmm4   # 4 = sb1u + k
422         eor     v0.16b, v1.16b, v4.16b          // vpxor        %xmm4,  %xmm1,  %xmm0   # 0 = A
423         tbl     v0.16b, {v0.16b}, v2.16b        // vpshufb      %xmm2,  %xmm0,  %xmm0
424         ret
425 .size   _vpaes_decrypt_core,.-_vpaes_decrypt_core
426
427 .globl  vpaes_decrypt
428 .type   vpaes_decrypt,%function
429 .align  4
430 vpaes_decrypt:
431         stp     x29,x30,[sp,#-16]!
432         add     x29,sp,#0
433
434         ld1     {v7.16b}, [x0]
435         bl      _vpaes_decrypt_preheat
436         bl      _vpaes_decrypt_core
437         st1     {v0.16b}, [x1]
438
439         ldp     x29,x30,[sp],#16
440         ret
441 .size   vpaes_decrypt,.-vpaes_decrypt
442
443 // v14-v15 input, v0-v1 output
444 .type   _vpaes_decrypt_2x,%function
445 .align  4
446 _vpaes_decrypt_2x:
447         mov     x9, x2
448         ldr     w8, [x2,#240]                   // pull rounds
449
450                                                 // vmovdqa      .Lk_dipt(%rip), %xmm2   # iptlo
451         lsl     x11, x8, #4                     // mov  %rax,   %r11;   shl     $4, %r11
452         eor     x11, x11, #0x30                 // xor          $0x30,  %r11
453         adr     x10, .Lk_sr
454         and     x11, x11, #0x30                 // and          $0x30,  %r11
455         add     x11, x11, x10
456         adr     x10, .Lk_mc_forward+48
457
458         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm4           # round0 key
459         and     v1.16b,  v14.16b, v17.16b       // vpand        %xmm9,  %xmm0,  %xmm1
460         ushr    v0.16b,  v14.16b, #4            // vpsrlb       $4,     %xmm0,  %xmm0
461         and     v9.16b,  v15.16b, v17.16b
462         ushr    v8.16b,  v15.16b, #4
463         tbl     v2.16b,  {v20.16b},v1.16b       // vpshufb      %xmm1,  %xmm2,  %xmm2
464         tbl     v10.16b, {v20.16b},v9.16b
465         ld1     {v5.2d}, [x10]                  // vmovdqa      .Lk_mc_forward+48(%rip), %xmm5
466                                                 // vmovdqa      .Lk_dipt+16(%rip), %xmm1 # ipthi
467         tbl     v0.16b,  {v21.16b},v0.16b       // vpshufb      %xmm0,  %xmm1,  %xmm0
468         tbl     v8.16b,  {v21.16b},v8.16b
469         eor     v2.16b,  v2.16b,  v16.16b       // vpxor        %xmm4,  %xmm2,  %xmm2
470         eor     v10.16b, v10.16b, v16.16b
471         eor     v0.16b,  v0.16b,  v2.16b        // vpxor        %xmm2,  %xmm0,  %xmm0
472         eor     v8.16b,  v8.16b,  v10.16b
473         b       .Ldec_2x_entry
474
475 .align  4
476 .Ldec_2x_loop:
477 //
478 //  Inverse mix columns
479 //
480                                                 // vmovdqa      -0x20(%r10),%xmm4               # 4 : sb9u
481                                                 // vmovdqa      -0x10(%r10),%xmm1               # 0 : sb9t
482         tbl     v4.16b,  {v24.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sb9u
483         tbl     v12.16b, {v24.16b}, v10.16b
484         tbl     v1.16b,  {v25.16b}, v3.16b      // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sb9t
485         tbl     v9.16b,  {v25.16b}, v11.16b
486         eor     v0.16b,  v4.16b,  v16.16b       // vpxor        %xmm4,  %xmm0,  %xmm0
487         eor     v8.16b,  v12.16b, v16.16b
488                                                 // vmovdqa      0x00(%r10),%xmm4                # 4 : sbdu
489         eor     v0.16b,  v0.16b,  v1.16b        // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
490         eor     v8.16b,  v8.16b,  v9.16b        // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
491                                                 // vmovdqa      0x10(%r10),%xmm1                # 0 : sbdt
492
493         tbl     v4.16b,  {v26.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbdu
494         tbl     v12.16b, {v26.16b}, v10.16b
495         tbl     v0.16b,  {v0.16b},v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
496         tbl     v8.16b,  {v8.16b},v5.16b
497         tbl     v1.16b,  {v27.16b}, v3.16b      // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbdt
498         tbl     v9.16b,  {v27.16b}, v11.16b
499         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
500         eor     v8.16b,  v8.16b,  v12.16b
501                                                 // vmovdqa      0x20(%r10),     %xmm4           # 4 : sbbu
502         eor     v0.16b,  v0.16b,  v1.16b        // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
503         eor     v8.16b,  v8.16b,  v9.16b
504                                                 // vmovdqa      0x30(%r10),     %xmm1           # 0 : sbbt
505
506         tbl     v4.16b,  {v28.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbbu
507         tbl     v12.16b, {v28.16b}, v10.16b
508         tbl     v0.16b,  {v0.16b},v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
509         tbl     v8.16b,  {v8.16b},v5.16b
510         tbl     v1.16b,  {v29.16b}, v3.16b      // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbbt
511         tbl     v9.16b,  {v29.16b}, v11.16b
512         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
513         eor     v8.16b,  v8.16b,  v12.16b
514                                                 // vmovdqa      0x40(%r10),     %xmm4           # 4 : sbeu
515         eor     v0.16b,  v0.16b,  v1.16b        // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
516         eor     v8.16b,  v8.16b,  v9.16b
517                                                 // vmovdqa      0x50(%r10),     %xmm1           # 0 : sbet
518
519         tbl     v4.16b,  {v30.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4           # 4 = sbeu
520         tbl     v12.16b, {v30.16b}, v10.16b
521         tbl     v0.16b,  {v0.16b},v5.16b        // vpshufb      %xmm5,  %xmm0,  %xmm0           # MC ch
522         tbl     v8.16b,  {v8.16b},v5.16b
523         tbl     v1.16b,  {v31.16b}, v3.16b      // vpshufb      %xmm3,  %xmm1,  %xmm1           # 0 = sbet
524         tbl     v9.16b,  {v31.16b}, v11.16b
525         eor     v0.16b,  v0.16b,  v4.16b        // vpxor        %xmm4,  %xmm0,  %xmm0           # 4 = ch
526         eor     v8.16b,  v8.16b,  v12.16b
527         ext     v5.16b,  v5.16b,  v5.16b, #12   // vpalignr $12,        %xmm5,  %xmm5,  %xmm5
528         eor     v0.16b,  v0.16b,  v1.16b        // vpxor        %xmm1,  %xmm0,  %xmm0           # 0 = ch
529         eor     v8.16b,  v8.16b,  v9.16b
530         sub     w8, w8, #1                      // sub          $1,%rax                 # nr--
531
532 .Ldec_2x_entry:
533         // top of round
534         and     v1.16b,  v0.16b,  v17.16b       // vpand        %xmm9,  %xmm0,  %xmm1   # 0 = k
535         ushr    v0.16b,  v0.16b,  #4            // vpsrlb       $4,     %xmm0,  %xmm0   # 1 = i
536         and     v9.16b,  v8.16b,  v17.16b
537         ushr    v8.16b,  v8.16b,  #4
538         tbl     v2.16b,  {v19.16b},v1.16b       // vpshufb      %xmm1,  %xmm11, %xmm2   # 2 = a/k
539         tbl     v10.16b, {v19.16b},v9.16b
540         eor     v1.16b,  v1.16b,  v0.16b        // vpxor        %xmm0,  %xmm1,  %xmm1   # 0 = j
541         eor     v9.16b,  v9.16b,  v8.16b
542         tbl     v3.16b,  {v18.16b},v0.16b       // vpshufb      %xmm0,  %xmm10, %xmm3   # 3 = 1/i
543         tbl     v11.16b, {v18.16b},v8.16b
544         tbl     v4.16b,  {v18.16b},v1.16b       // vpshufb      %xmm1,  %xmm10, %xmm4   # 4 = 1/j
545         tbl     v12.16b, {v18.16b},v9.16b
546         eor     v3.16b,  v3.16b,  v2.16b        // vpxor        %xmm2,  %xmm3,  %xmm3   # 3 = iak = 1/i + a/k
547         eor     v11.16b, v11.16b, v10.16b
548         eor     v4.16b,  v4.16b,  v2.16b        // vpxor        %xmm2,  %xmm4,  %xmm4   # 4 = jak = 1/j + a/k
549         eor     v12.16b, v12.16b, v10.16b
550         tbl     v2.16b,  {v18.16b},v3.16b       // vpshufb      %xmm3,  %xmm10, %xmm2   # 2 = 1/iak
551         tbl     v10.16b, {v18.16b},v11.16b
552         tbl     v3.16b,  {v18.16b},v4.16b       // vpshufb      %xmm4,  %xmm10, %xmm3   # 3 = 1/jak
553         tbl     v11.16b, {v18.16b},v12.16b
554         eor     v2.16b,  v2.16b,  v1.16b        // vpxor        %xmm1,  %xmm2,  %xmm2   # 2 = io
555         eor     v10.16b, v10.16b, v9.16b
556         eor     v3.16b,  v3.16b,  v0.16b        // vpxor        %xmm0,  %xmm3,  %xmm3   # 3 = jo
557         eor     v11.16b, v11.16b, v8.16b
558         ld1     {v16.2d}, [x9],#16              // vmovdqu      (%r9),  %xmm0
559         cbnz    w8, .Ldec_2x_loop
560
561         // middle of last round
562                                                 // vmovdqa      0x60(%r10),     %xmm4   # 3 : sbou
563         tbl     v4.16b,  {v22.16b}, v2.16b      // vpshufb      %xmm2,  %xmm4,  %xmm4   # 4 = sbou
564         tbl     v12.16b, {v22.16b}, v10.16b
565                                                 // vmovdqa      0x70(%r10),     %xmm1   # 0 : sbot
566         tbl     v1.16b,  {v23.16b}, v3.16b      // vpshufb      %xmm3,  %xmm1,  %xmm1   # 0 = sb1t
567         tbl     v9.16b,  {v23.16b}, v11.16b
568         ld1     {v2.2d}, [x11]                  // vmovdqa      -0x160(%r11),   %xmm2   # .Lk_sr-.Lk_dsbd=-0x160
569         eor     v4.16b,  v4.16b,  v16.16b       // vpxor        %xmm0,  %xmm4,  %xmm4   # 4 = sb1u + k
570         eor     v12.16b, v12.16b, v16.16b
571         eor     v0.16b,  v1.16b,  v4.16b        // vpxor        %xmm4,  %xmm1,  %xmm0   # 0 = A
572         eor     v8.16b,  v9.16b,  v12.16b
573         tbl     v0.16b,  {v0.16b},v2.16b        // vpshufb      %xmm2,  %xmm0,  %xmm0
574         tbl     v1.16b,  {v8.16b},v2.16b
575         ret
576 .size   _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
577 ########################################################
578 ##                                                    ##
579 ##                  AES key schedule                  ##
580 ##                                                    ##
581 ########################################################
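##
##  _vpaes_key_preheat register map (AArch64): v16 = 0x5B (.Lk_s63),
##  v17 = 0x0F mask, v18-v21 = .Lk_inv/.Lk_ipt, v22-v23 = .Lk_sb1,
##  v24-v31 = .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9, v8 = .Lk_rcon and
##  v9 = .Lk_mc_forward[0].
##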
582 .type   _vpaes_key_preheat,%function
583 .align  4
584 _vpaes_key_preheat:
585         adr     x10, .Lk_inv
586         movi    v16.16b, #0x5b                  // .Lk_s63
587         adr     x11, .Lk_sb1
588         movi    v17.16b, #0x0f                  // .Lk_s0F
589         ld1     {v18.2d,v19.2d,v20.2d,v21.2d}, [x10]            // .Lk_inv, .Lk_ipt
590         adr     x10, .Lk_dksd
591         ld1     {v22.2d,v23.2d}, [x11]          // .Lk_sb1
592         adr     x11, .Lk_mc_forward
593         ld1     {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64        // .Lk_dksd, .Lk_dksb
594         ld1     {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64        // .Lk_dkse, .Lk_dks9
595         ld1     {v8.2d}, [x10]                  // .Lk_rcon
596         ld1     {v9.2d}, [x11]                  // .Lk_mc_forward[0]
597         ret
598 .size   _vpaes_key_preheat,.-_vpaes_key_preheat
599
600 .type   _vpaes_schedule_core,%function
601 .align  4
602 _vpaes_schedule_core:
603         stp     x29, x30, [sp,#-16]!
604         add     x29,sp,#0
605
606         bl      _vpaes_key_preheat              // load the tables
607
608         ld1     {v0.16b}, [x0],#16              // vmovdqu      (%rdi), %xmm0           # load key (unaligned)
609
610         // input transform
611         mov     v3.16b, v0.16b                  // vmovdqa      %xmm0,  %xmm3
612         bl      _vpaes_schedule_transform
613         mov     v7.16b, v0.16b                  // vmovdqa      %xmm0,  %xmm7
614
615         adr     x10, .Lk_sr                     // lea  .Lk_sr(%rip),%r10
616         add     x8, x8, x10
617         cbnz    w3, .Lschedule_am_decrypting
618
619         // encrypting, output zeroth round key after transform
620         st1     {v0.2d}, [x2]                   // vmovdqu      %xmm0,  (%rdx)
621         b       .Lschedule_go
622
623 .Lschedule_am_decrypting:
624         // decrypting, output zeroth round key after shiftrows
625         ld1     {v1.2d}, [x8]                   // vmovdqa      (%r8,%r10),     %xmm1
626         tbl     v3.16b, {v3.16b}, v1.16b        // vpshufb  %xmm1,      %xmm3,  %xmm3
627         st1     {v3.2d}, [x2]                   // vmovdqu      %xmm3,  (%rdx)
628         eor     x8, x8, #0x30                   // xor  $0x30, %r8
629
630 .Lschedule_go:
631         cmp     w1, #192                        // cmp  $192,   %esi
632         b.hi    .Lschedule_256
633         b.eq    .Lschedule_192
634         // 128: fall through
635
636 ##
637 ##  .schedule_128
638 ##
639 ##  128-bit specific part of key schedule.
640 ##
641 ##  This schedule is really simple, because all its parts
642 ##  are accomplished by the subroutines.
643 ##
644 .Lschedule_128:
645         mov     x0, #10                 // mov  $10, %esi
646
647 .Loop_schedule_128:
648         sub     x0, x0, #1                      // dec  %esi
649         bl      _vpaes_schedule_round
650         cbz     x0, .Lschedule_mangle_last
651         bl      _vpaes_schedule_mangle          // write output
652         b       .Loop_schedule_128
653
654 ##
655 ##  .aes_schedule_192
656 ##
657 ##  192-bit specific part of key schedule.
658 ##
659 ##  The main body of this schedule is the same as the 128-bit
660 ##  schedule, but with more smearing.  The long, high side is
661 ##  stored in %xmm7 as before, and the short, low side is in
662 ##  the high bits of %xmm6.
663 ##
664 ##  This schedule is somewhat nastier, however, because each
665 ##  round produces 192 bits of key material, or 1.5 round keys.
666 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
667 ##  keys.
668 ##
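##  Concretely: AES-192 needs 13 round keys.  Each pass of .Loop_schedule_192
##  runs two schedule rounds and writes three keys (the fourth pass writes its
##  third via .Lschedule_mangle_last), so four passes plus the round-0 key
##  written in .Lschedule_go give 1 + 4*3 = 13.
##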
669 .align  4
670 .Lschedule_192:
671         sub     x0, x0, #8
672         ld1     {v0.16b}, [x0]          // vmovdqu      8(%rdi),%xmm0           # load key part 2 (very unaligned)
673         bl      _vpaes_schedule_transform       // input transform
674         mov     v6.16b, v0.16b                  // vmovdqa      %xmm0,  %xmm6           # save short part
675         eor     v4.16b, v4.16b, v4.16b          // vpxor        %xmm4,  %xmm4, %xmm4    # clear 4
676         ins     v6.d[0], v4.d[0]                // vmovhlps     %xmm4,  %xmm6,  %xmm6           # clobber low side with zeros
677         mov     x0, #4                  // mov  $4,     %esi
678
679 .Loop_schedule_192:
680         sub     x0, x0, #1                      // dec  %esi
681         bl      _vpaes_schedule_round
682         ext     v0.16b, v6.16b, v0.16b, #8      // vpalignr     $8,%xmm6,%xmm0,%xmm0
683         bl      _vpaes_schedule_mangle          // save key n
684         bl      _vpaes_schedule_192_smear
685         bl      _vpaes_schedule_mangle          // save key n+1
686         bl      _vpaes_schedule_round
687         cbz     x0, .Lschedule_mangle_last
688         bl      _vpaes_schedule_mangle          // save key n+2
689         bl      _vpaes_schedule_192_smear
690         b       .Loop_schedule_192
691
692 ##
693 ##  .aes_schedule_256
694 ##
695 ##  256-bit specific part of key schedule.
696 ##
697 ##  The structure here is very similar to the 128-bit
698 ##  schedule, but with an additional "low side" in
699 ##  %xmm6.  The low side's rounds are the same as the
700 ##  high side's, except no rcon and no rotation.
701 ##
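##  Concretely: AES-256 needs 15 round keys.  Each pass of .Loop_schedule_256
##  writes two keys (low then high); the seventh pass stops after its low key
##  and the last key comes from .Lschedule_mangle_last, so the round-0 key
##  plus 6*2 + 1 + 1 = 15.
##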
702 .align  4
703 .Lschedule_256:
704         ld1     {v0.16b}, [x0]          // vmovdqu      16(%rdi),%xmm0          # load key part 2 (unaligned)
705         bl      _vpaes_schedule_transform       // input transform
706         mov     x0, #7                  // mov  $7, %esi
707
708 .Loop_schedule_256:
709         sub     x0, x0, #1                      // dec  %esi
710         bl      _vpaes_schedule_mangle          // output low result
711         mov     v6.16b, v0.16b                  // vmovdqa      %xmm0,  %xmm6           # save cur_lo in xmm6
712
713         // high round
714         bl      _vpaes_schedule_round
715         cbz     x0, .Lschedule_mangle_last
716         bl      _vpaes_schedule_mangle
717
718         // low round. swap xmm7 and xmm6
719         dup     v0.4s, v0.s[3]                  // vpshufd      $0xFF,  %xmm0,  %xmm0
720         movi    v4.16b, #0
721         mov     v5.16b, v7.16b                  // vmovdqa      %xmm7,  %xmm5
722         mov     v7.16b, v6.16b                  // vmovdqa      %xmm6,  %xmm7
723         bl      _vpaes_schedule_low_round
724         mov     v7.16b, v5.16b                  // vmovdqa      %xmm5,  %xmm7
725
726         b       .Loop_schedule_256
727
728 ##
729 ##  .aes_schedule_mangle_last
730 ##
731 ##  Mangler for last round of key schedule
732 ##  Mangles %xmm0
733 ##    when encrypting, outputs out(%xmm0) ^ 63
734 ##    when decrypting, outputs unskew(%xmm0)
735 ##
736 ##  Always called right before return... jumps to cleanup and exits
737 ##
738 .align  4
739 .Lschedule_mangle_last:
740         // schedule last round key from xmm0
741         adr     x11, .Lk_deskew                 // lea  .Lk_deskew(%rip),%r11   # prepare to deskew
742         cbnz    w3, .Lschedule_mangle_last_dec
743
744         // encrypting
745         ld1     {v1.2d}, [x8]                   // vmovdqa      (%r8,%r10),%xmm1
746         adr     x11, .Lk_opt                    // lea  .Lk_opt(%rip),  %r11            # prepare to output transform
747         add     x2, x2, #32                     // add  $32,    %rdx
748         tbl     v0.16b, {v0.16b}, v1.16b        // vpshufb      %xmm1,  %xmm0,  %xmm0           # output permute
749
750 .Lschedule_mangle_last_dec:
751         ld1     {v20.2d,v21.2d}, [x11]          // reload constants
752         sub     x2, x2, #16                     // add  $-16,   %rdx
753         eor     v0.16b, v0.16b, v16.16b         // vpxor        .Lk_s63(%rip),  %xmm0,  %xmm0
754         bl      _vpaes_schedule_transform       // output transform
755         st1     {v0.2d}, [x2]                   // vmovdqu      %xmm0,  (%rdx)          # save last key
756
757         // cleanup
758         eor     v0.16b, v0.16b, v0.16b          // vpxor        %xmm0,  %xmm0,  %xmm0
759         eor     v1.16b, v1.16b, v1.16b          // vpxor        %xmm1,  %xmm1,  %xmm1
760         eor     v2.16b, v2.16b, v2.16b          // vpxor        %xmm2,  %xmm2,  %xmm2
761         eor     v3.16b, v3.16b, v3.16b          // vpxor        %xmm3,  %xmm3,  %xmm3
762         eor     v4.16b, v4.16b, v4.16b          // vpxor        %xmm4,  %xmm4,  %xmm4
763         eor     v5.16b, v5.16b, v5.16b          // vpxor        %xmm5,  %xmm5,  %xmm5
764         eor     v6.16b, v6.16b, v6.16b          // vpxor        %xmm6,  %xmm6,  %xmm6
765         eor     v7.16b, v7.16b, v7.16b          // vpxor        %xmm7,  %xmm7,  %xmm7
766         ldp     x29, x30, [sp],#16
767         ret
768 .size   _vpaes_schedule_core,.-_vpaes_schedule_core
769
770 ##
771 ##  .aes_schedule_192_smear
772 ##
773 ##  Smear the short, low side in the 192-bit key schedule.
774 ##
775 ##  Inputs:
776 ##    %xmm7: high side, b  a  x  y
777 ##    %xmm6:  low side, d  c  0  0
778 ##    %xmm13: 0
779 ##
780 ##  Outputs:
781 ##    %xmm6: b+c+d  b+c  0  0
782 ##    %xmm0: b+c+d  b+c  b  a
783 ##
784 .type   _vpaes_schedule_192_smear,%function
785 .align  4
786 _vpaes_schedule_192_smear:
787         movi    v1.16b, #0
788         dup     v0.4s, v7.s[3]
789         ins     v1.s[3], v6.s[2]        // vpshufd      $0x80,  %xmm6,  %xmm1   # d c 0 0 -> c 0 0 0
790         ins     v0.s[0], v7.s[2]        // vpshufd      $0xFE,  %xmm7,  %xmm0   # b a _ _ -> b b b a
791         eor     v6.16b, v6.16b, v1.16b  // vpxor        %xmm1,  %xmm6,  %xmm6   # -> c+d c 0 0
792         eor     v1.16b, v1.16b, v1.16b  // vpxor        %xmm1,  %xmm1,  %xmm1
793         eor     v6.16b, v6.16b, v0.16b  // vpxor        %xmm0,  %xmm6,  %xmm6   # -> b+c+d b+c b a
794         mov     v0.16b, v6.16b          // vmovdqa      %xmm6,  %xmm0
795         ins     v6.d[0], v1.d[0]        // vmovhlps     %xmm1,  %xmm6,  %xmm6   # clobber low side with zeros
796         ret
797 .size   _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
798
799 ##
800 ##  .aes_schedule_round
801 ##
802 ##  Runs one main round of the key schedule on %xmm0, %xmm7
803 ##
804 ##  Specifically, runs subbytes on the high dword of %xmm0
805 ##  then rotates it by one byte and xors into the low dword of
806 ##  %xmm7.
807 ##
808 ##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
809 ##  next rcon.
810 ##
811 ##  Smears the dwords of %xmm7 by xoring the low into the
812 ##  second low, result into third, result into highest.
813 ##
814 ##  Returns results in %xmm7 = %xmm0.
815 ##  Clobbers %xmm1-%xmm4, %r11.
816 ##
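##  In AES terms this is the usual expansion recurrence: the first new word is
##  w[i-Nk] ^ SubWord(RotWord(w[i-1])) ^ Rcon, and the "smear" supplies the
##  chained XORs for the remaining words, with SubWord evaluated through the
##  vector-permute S-box tables (v18/v19, v22/v23) instead of a byte lookup.
##  _vpaes_schedule_low_round is the AES-256 variant of the step: SubWord
##  only, no rotation and no rcon.
##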
817 .type   _vpaes_schedule_round,%function
818 .align  4
819 _vpaes_schedule_round:
820         // extract rcon from xmm8
821         movi    v4.16b, #0                      // vpxor        %xmm4,  %xmm4,  %xmm4
822         ext     v1.16b, v8.16b, v4.16b, #15     // vpalignr     $15,    %xmm8,  %xmm4,  %xmm1
823         ext     v8.16b, v8.16b, v8.16b, #15     // vpalignr     $15,    %xmm8,  %xmm8,  %xmm8
824         eor     v7.16b, v7.16b, v1.16b          // vpxor        %xmm1,  %xmm7,  %xmm7
825
826         // rotate
827         dup     v0.4s, v0.s[3]                  // vpshufd      $0xFF,  %xmm0,  %xmm0
828         ext     v0.16b, v0.16b, v0.16b, #1      // vpalignr     $1,     %xmm0,  %xmm0,  %xmm0
829
830         // fall through...
831
832         // low round: same as high round, but no rotation and no rcon.
833 _vpaes_schedule_low_round:
834         // smear xmm7
835         ext     v1.16b, v4.16b, v7.16b, #12     // vpslldq      $4,     %xmm7,  %xmm1
836         eor     v7.16b, v7.16b, v1.16b          // vpxor        %xmm1,  %xmm7,  %xmm7
837         ext     v4.16b, v4.16b, v7.16b, #8      // vpslldq      $8,     %xmm7,  %xmm4
838
839         // subbytes
840         and     v1.16b, v0.16b, v17.16b         // vpand        %xmm9,  %xmm0,  %xmm1           # 0 = k
841         ushr    v0.16b, v0.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0           # 1 = i
842         eor     v7.16b, v7.16b, v4.16b          // vpxor        %xmm4,  %xmm7,  %xmm7
843         tbl     v2.16b, {v19.16b}, v1.16b       // vpshufb      %xmm1,  %xmm11, %xmm2           # 2 = a/k
844         eor     v1.16b, v1.16b, v0.16b          // vpxor        %xmm0,  %xmm1,  %xmm1           # 0 = j
845         tbl     v3.16b, {v18.16b}, v0.16b       // vpshufb      %xmm0,  %xmm10, %xmm3           # 3 = 1/i
846         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3           # 3 = iak = 1/i + a/k
847         tbl     v4.16b, {v18.16b}, v1.16b       // vpshufb      %xmm1,  %xmm10, %xmm4           # 4 = 1/j
848         eor     v7.16b, v7.16b, v16.16b         // vpxor        .Lk_s63(%rip),  %xmm7,  %xmm7
849         tbl     v3.16b, {v18.16b}, v3.16b       // vpshufb      %xmm3,  %xmm10, %xmm3           # 2 = 1/iak
850         eor     v4.16b, v4.16b, v2.16b          // vpxor        %xmm2,  %xmm4,  %xmm4           # 4 = jak = 1/j + a/k
851         tbl     v2.16b, {v18.16b}, v4.16b       // vpshufb      %xmm4,  %xmm10, %xmm2           # 3 = 1/jak
852         eor     v3.16b, v3.16b, v1.16b          // vpxor        %xmm1,  %xmm3,  %xmm3           # 2 = io
853         eor     v2.16b, v2.16b, v0.16b          // vpxor        %xmm0,  %xmm2,  %xmm2           # 3 = jo
854         tbl     v4.16b, {v23.16b}, v3.16b       // vpshufb      %xmm3,  %xmm13, %xmm4           # 4 = sbou
855         tbl     v1.16b, {v22.16b}, v2.16b       // vpshufb      %xmm2,  %xmm12, %xmm1           # 0 = sb1t
856         eor     v1.16b, v1.16b, v4.16b          // vpxor        %xmm4,  %xmm1,  %xmm1           # 0 = sbox output
857
858         // add in smeared stuff
859         eor     v0.16b, v1.16b, v7.16b          // vpxor        %xmm7,  %xmm1,  %xmm0
860         eor     v7.16b, v1.16b, v7.16b          // vmovdqa      %xmm0,  %xmm7
861         ret
862 .size   _vpaes_schedule_round,.-_vpaes_schedule_round
863
864 ##
865 ##  .aes_schedule_transform
866 ##
867 ##  Linear-transform %xmm0 according to tables at (%r11)
868 ##
869 ##  Requires that %xmm9 = 0x0F0F... as in preheat
870 ##  Output in %xmm0
871 ##  Clobbers %xmm1, %xmm2
872 ##
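##  Per byte this computes out = hi_tbl[x >> 4] ^ lo_tbl[x & 0x0F]; here the
##  lo/hi tables are preloaded in v20/v21 and the 0x0F mask in v17 (standing
##  in for %r11 and %xmm9 of the x86 original).
##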
873 .type   _vpaes_schedule_transform,%function
874 .align  4
875 _vpaes_schedule_transform:
876         and     v1.16b, v0.16b, v17.16b         // vpand        %xmm9,  %xmm0,  %xmm1
877         ushr    v0.16b, v0.16b, #4              // vpsrlb       $4,     %xmm0,  %xmm0
878                                                 // vmovdqa      (%r11), %xmm2   # lo
879         tbl     v2.16b, {v20.16b}, v1.16b       // vpshufb      %xmm1,  %xmm2,  %xmm2
880                                                 // vmovdqa      16(%r11),       %xmm1 # hi
881         tbl     v0.16b, {v21.16b}, v0.16b       // vpshufb      %xmm0,  %xmm1,  %xmm0
882         eor     v0.16b, v0.16b, v2.16b          // vpxor        %xmm2,  %xmm0,  %xmm0
883         ret
884 .size   _vpaes_schedule_transform,.-_vpaes_schedule_transform
885
886 ##
887 ##  .aes_schedule_mangle
888 ##
889 ##  Mangle xmm0 from (basis-transformed) standard version
890 ##  to our version.
891 ##
892 ##  On encrypt,
893 ##    xor with 0x63
894 ##    multiply by circulant 0,1,1,1
895 ##    apply shiftrows transform
896 ##
897 ##  On decrypt,
898 ##    xor with 0x63
899 ##    multiply by "inverse mixcolumns" circulant E,B,D,9
900 ##    deskew
901 ##    apply shiftrows transform
902 ##
903 ##
904 ##  Writes out to (%rdx), and increments or decrements it
905 ##  Keeps track of round number mod 4 in %r8
906 ##  Preserves xmm0
907 ##  Clobbers xmm1-xmm5
908 ##
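##  AArch64 mapping: x2 is the output pointer (%rdx), x8 indexes .Lk_sr (%r8),
##  v9 holds .Lk_mc_forward[0], v16 the 0x5B (.Lk_s63) constant, and the
##  decrypt path reads its invskew tables from v24-v31.
##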
909 .type   _vpaes_schedule_mangle,%function
910 .align  4
911 _vpaes_schedule_mangle:
912         mov     v4.16b, v0.16b                  // vmovdqa      %xmm0,  %xmm4   # save xmm0 for later
913                                                 // vmovdqa      .Lk_mc_forward(%rip),%xmm5
914         cbnz    w3, .Lschedule_mangle_dec
915
916         // encrypting
917         eor     v4.16b, v0.16b, v16.16b         // vpxor        .Lk_s63(%rip),  %xmm0,  %xmm4
918         add     x2, x2, #16                     // add  $16,    %rdx
919         tbl     v4.16b, {v4.16b}, v9.16b        // vpshufb      %xmm5,  %xmm4,  %xmm4
920         tbl     v1.16b, {v4.16b}, v9.16b        // vpshufb      %xmm5,  %xmm4,  %xmm1
921         tbl     v3.16b, {v1.16b}, v9.16b        // vpshufb      %xmm5,  %xmm1,  %xmm3
922         eor     v4.16b, v4.16b, v1.16b          // vpxor        %xmm1,  %xmm4,  %xmm4
923         ld1     {v1.2d}, [x8]                   // vmovdqa      (%r8,%r10),     %xmm1
924         eor     v3.16b, v3.16b, v4.16b          // vpxor        %xmm4,  %xmm3,  %xmm3
925
926         b       .Lschedule_mangle_both
927 .align  4
928 .Lschedule_mangle_dec:
929         // inverse mix columns
930                                                 // lea  .Lk_dksd(%rip),%r11
931         ushr    v1.16b, v4.16b, #4              // vpsrlb       $4,     %xmm4,  %xmm1   # 1 = hi
932         and     v4.16b, v4.16b, v17.16b         // vpand        %xmm9,  %xmm4,  %xmm4   # 4 = lo
933
934                                                 // vmovdqa      0x00(%r11),     %xmm2
935         tbl     v2.16b, {v24.16b}, v4.16b       // vpshufb      %xmm4,  %xmm2,  %xmm2
936                                                 // vmovdqa      0x10(%r11),     %xmm3
937         tbl     v3.16b, {v25.16b}, v1.16b       // vpshufb      %xmm1,  %xmm3,  %xmm3
938         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3
939         tbl     v3.16b, {v3.16b}, v9.16b        // vpshufb      %xmm5,  %xmm3,  %xmm3
940
941                                                 // vmovdqa      0x20(%r11),     %xmm2
942         tbl     v2.16b, {v26.16b}, v4.16b       // vpshufb      %xmm4,  %xmm2,  %xmm2
943         eor     v2.16b, v2.16b, v3.16b          // vpxor        %xmm3,  %xmm2,  %xmm2
944                                                 // vmovdqa      0x30(%r11),     %xmm3
945         tbl     v3.16b, {v27.16b}, v1.16b       // vpshufb      %xmm1,  %xmm3,  %xmm3
946         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3
947         tbl     v3.16b, {v3.16b}, v9.16b        // vpshufb      %xmm5,  %xmm3,  %xmm3
948
949                                                 // vmovdqa      0x40(%r11),     %xmm2
950         tbl     v2.16b, {v28.16b}, v4.16b       // vpshufb      %xmm4,  %xmm2,  %xmm2
951         eor     v2.16b, v2.16b, v3.16b          // vpxor        %xmm3,  %xmm2,  %xmm2
952                                                 // vmovdqa      0x50(%r11),     %xmm3
953         tbl     v3.16b, {v29.16b}, v1.16b       // vpshufb      %xmm1,  %xmm3,  %xmm3
954         eor     v3.16b, v3.16b, v2.16b          // vpxor        %xmm2,  %xmm3,  %xmm3
955
956                                                 // vmovdqa      0x60(%r11),     %xmm2
957         tbl     v2.16b, {v30.16b}, v4.16b       // vpshufb      %xmm4,  %xmm2,  %xmm2
958         tbl     v3.16b, {v3.16b}, v9.16b        // vpshufb      %xmm5,  %xmm3,  %xmm3
959                                                 // vmovdqa      0x70(%r11),     %xmm4
960         tbl     v4.16b, {v31.16b}, v1.16b       // vpshufb      %xmm1,  %xmm4,  %xmm4
961         ld1     {v1.2d}, [x8]                   // vmovdqa      (%r8,%r10),     %xmm1
962         eor     v2.16b, v2.16b, v3.16b          // vpxor        %xmm3,  %xmm2,  %xmm2
963         eor     v3.16b, v4.16b, v2.16b          // vpxor        %xmm2,  %xmm4,  %xmm3
964
965         sub     x2, x2, #16                     // add  $-16,   %rdx
966
967 .Lschedule_mangle_both:
968         tbl     v3.16b, {v3.16b}, v1.16b        // vpshufb      %xmm1,  %xmm3,  %xmm3
969         add     x8, x8, #64-16                  // add  $-16,   %r8
970         and     x8, x8, #~(1<<6)                // and  $0x30,  %r8
971         st1     {v3.2d}, [x2]                   // vmovdqu      %xmm3,  (%rdx)
972         ret
973 .size   _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
974
975 .globl  vpaes_set_encrypt_key
976 .type   vpaes_set_encrypt_key,%function
977 .align  4
978 vpaes_set_encrypt_key:
979         stp     x29,x30,[sp,#-16]!
980         add     x29,sp,#0
981         stp     d8,d9,[sp,#-16]!        // ABI spec says so
982
983         lsr     w9, w1, #5              // shr  $5,%eax
984         add     w9, w9, #5              // $5,%eax
985         str     w9, [x2,#240]           // mov  %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;
986
987         mov     w3, #0          // mov  $0,%ecx
988         mov     x8, #0x30               // mov  $0x30,%r8d
989         bl      _vpaes_schedule_core
990         eor     x0, x0, x0
991
992         ldp     d8,d9,[sp],#16
993         ldp     x29,x30,[sp],#16
994         ret
995 .size   vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
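##
##  Note the stored round count is nbits/32 + 5, i.e. 9/11/13 for 128/192/256
##  bit keys: it is the number of "middle" rounds the encrypt/decrypt cores
##  iterate, one less than the conventional Nr.
##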
996
997 .globl  vpaes_set_decrypt_key
998 .type   vpaes_set_decrypt_key,%function
999 .align  4
1000 vpaes_set_decrypt_key:
1001         stp     x29,x30,[sp,#-16]!
1002         add     x29,sp,#0
1003         stp     d8,d9,[sp,#-16]!        // ABI spec says so
1004
1005         lsr     w9, w1, #5              // shr  $5,%eax
1006         add     w9, w9, #5              // $5,%eax
1007         str     w9, [x2,#240]           // mov  %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;
1008         lsl     w9, w9, #4              // shl  $4,%eax
1009         add     x2, x2, #16             // lea  16(%rdx,%rax),%rdx
1010         add     x2, x2, x9
1011
1012         mov     w3, #1          // mov  $1,%ecx
1013         lsr     w8, w1, #1              // shr  $1,%r8d
1014         and     x8, x8, #32             // and  $32,%r8d
1015         eor     x8, x8, #32             // xor  $32,%r8d        # nbits==192?0:32
1016         bl      _vpaes_schedule_core
1017
1018         ldp     d8,d9,[sp],#16
1019         ldp     x29,x30,[sp],#16
1020         ret
1021 .size   vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1022 .globl  vpaes_cbc_encrypt
1023 .type   vpaes_cbc_encrypt,%function
1024 .align  4
1025 vpaes_cbc_encrypt:
1026         cbz     x2, .Lcbc_abort
1027         cmp     w5, #0                  // check direction
1028         b.eq    vpaes_cbc_decrypt
1029
1030         stp     x29,x30,[sp,#-16]!
1031         add     x29,sp,#0
1032
1033         mov     x17, x2         // reassign
1034         mov     x2,  x3         // reassign
1035
1036         ld1     {v0.16b}, [x4]  // load ivec
1037         bl      _vpaes_encrypt_preheat
1038         b       .Lcbc_enc_loop
1039
1040 .align  4
1041 .Lcbc_enc_loop:
1042         ld1     {v7.16b}, [x0],#16      // load input
1043         eor     v7.16b, v7.16b, v0.16b  // xor with ivec
1044         bl      _vpaes_encrypt_core
1045         st1     {v0.16b}, [x1],#16      // save output
1046         subs    x17, x17, #16
1047         b.hi    .Lcbc_enc_loop
1048
1049         st1     {v0.16b}, [x4]  // write ivec
1050
1051         ldp     x29,x30,[sp],#16
1052 .Lcbc_abort:
1053         ret
1054 .size   vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
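##
##  Illustrative C sketch for the CBC entry point (assumes the prototype
##  OpenSSL's C callers declare; length is processed in 16-byte blocks and the
##  IV buffer is updated in place).
##
##      #include <stddef.h>
##      #include <openssl/aes.h>
##
##      int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##                                 AES_KEY *key);
##      void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##                             size_t length, const AES_KEY *key,
##                             unsigned char *ivec, int enc);
##
##      static void cbc_encrypt_128(const unsigned char key[16],
##                                  unsigned char iv[16],
##                                  const unsigned char *in, unsigned char *out,
##                                  size_t len /* multiple of 16 */)
##      {
##          AES_KEY ks;
##          vpaes_set_encrypt_key(key, 128, &ks);
##          vpaes_cbc_encrypt(in, out, len, &ks, iv, 1);   /* enc != 0: encrypt */
##      }
##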
1055
1056 .type   vpaes_cbc_decrypt,%function
1057 .align  4
1058 vpaes_cbc_decrypt:
1059         stp     x29,x30,[sp,#-16]!
1060         add     x29,sp,#0
1061         stp     d8,d9,[sp,#-16]!        // ABI spec says so
1062         stp     d10,d11,[sp,#-16]!
1063         stp     d12,d13,[sp,#-16]!
1064         stp     d14,d15,[sp,#-16]!
1065
1066         mov     x17, x2         // reassign
1067         mov     x2,  x3         // reassign
1068         ld1     {v6.16b}, [x4]  // load ivec
1069         bl      _vpaes_decrypt_preheat
1070         tst     x17, #16
1071         b.eq    .Lcbc_dec_loop2x
1072
1073         ld1     {v7.16b}, [x0], #16     // load input
1074         bl      _vpaes_decrypt_core
1075         eor     v0.16b, v0.16b, v6.16b  // xor with ivec
1076         orr     v6.16b, v7.16b, v7.16b  // next ivec value
1077         st1     {v0.16b}, [x1], #16
1078         subs    x17, x17, #16
1079         b.ls    .Lcbc_dec_done
1080
1081 .align  4
1082 .Lcbc_dec_loop2x:
1083         ld1     {v14.16b,v15.16b}, [x0], #32
1084         bl      _vpaes_decrypt_2x
1085         eor     v0.16b, v0.16b, v6.16b  // xor with ivec
1086         eor     v1.16b, v1.16b, v14.16b
1087         orr     v6.16b, v15.16b, v15.16b
1088         st1     {v0.16b,v1.16b}, [x1], #32
1089         subs    x17, x17, #32
1090         b.hi    .Lcbc_dec_loop2x
1091
1092 .Lcbc_dec_done:
1093         st1     {v6.16b}, [x4]
1094
1095         ldp     d14,d15,[sp],#16
1096         ldp     d12,d13,[sp],#16
1097         ldp     d10,d11,[sp],#16
1098         ldp     d8,d9,[sp],#16
1099         ldp     x29,x30,[sp],#16
1100         ret
1101 .size   vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1102 .globl  vpaes_ecb_encrypt
1103 .type   vpaes_ecb_encrypt,%function
1104 .align  4
1105 vpaes_ecb_encrypt:
1106         stp     x29,x30,[sp,#-16]!
1107         add     x29,sp,#0
1108         stp     d8,d9,[sp,#-16]!        // ABI spec says so
1109         stp     d10,d11,[sp,#-16]!
1110         stp     d12,d13,[sp,#-16]!
1111         stp     d14,d15,[sp,#-16]!
1112
1113         mov     x17, x2
1114         mov     x2,  x3
1115         bl      _vpaes_encrypt_preheat
1116         tst     x17, #16
1117         b.eq    .Lecb_enc_loop
1118
1119         ld1     {v7.16b}, [x0],#16
1120         bl      _vpaes_encrypt_core
1121         st1     {v0.16b}, [x1],#16
1122         subs    x17, x17, #16
1123         b.ls    .Lecb_enc_done
1124
1125 .align  4
1126 .Lecb_enc_loop:
1127         ld1     {v14.16b,v15.16b}, [x0], #32
1128         bl      _vpaes_encrypt_2x
1129         st1     {v0.16b,v1.16b}, [x1], #32
1130         subs    x17, x17, #32
1131         b.hi    .Lecb_enc_loop
1132
1133 .Lecb_enc_done:
1134         ldp     d14,d15,[sp],#16
1135         ldp     d12,d13,[sp],#16
1136         ldp     d10,d11,[sp],#16
1137         ldp     d8,d9,[sp],#16
1138         ldp     x29,x30,[sp],#16
1139         ret
1140 .size   vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1141
1142 .globl  vpaes_ecb_decrypt
1143 .type   vpaes_ecb_decrypt,%function
1144 .align  4
1145 vpaes_ecb_decrypt:
1146         stp     x29,x30,[sp,#-16]!
1147         add     x29,sp,#0
1148         stp     d8,d9,[sp,#-16]!        // ABI spec says so
1149         stp     d10,d11,[sp,#-16]!
1150         stp     d12,d13,[sp,#-16]!
1151         stp     d14,d15,[sp,#-16]!
1152
1153         mov     x17, x2
1154         mov     x2,  x3
1155         bl      _vpaes_decrypt_preheat
1156         tst     x17, #16
1157         b.eq    .Lecb_dec_loop
1158
1159         ld1     {v7.16b}, [x0],#16
1160         bl      _vpaes_decrypt_core
1161         st1     {v0.16b}, [x1],#16
1162         subs    x17, x17, #16
1163         b.ls    .Lecb_dec_done
1164
1165 .align  4
1166 .Lecb_dec_loop:
1167         ld1     {v14.16b,v15.16b}, [x0], #32
1168         bl      _vpaes_decrypt_2x
1169         st1     {v0.16b,v1.16b}, [x1], #32
1170         subs    x17, x17, #32
1171         b.hi    .Lecb_dec_loop
1172
1173 .Lecb_dec_done:
1174         ldp     d14,d15,[sp],#16
1175         ldp     d12,d13,[sp],#16
1176         ldp     d10,d11,[sp],#16
1177         ldp     d8,d9,[sp],#16
1178         ldp     x29,x30,[sp],#16
1179         ret
1180 .size   vpaes_ecb_decrypt,.-vpaes_ecb_decrypt