]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bearssl/src/symcipher/aes_x86ni_cbcdec.c
Add libbearssl
[FreeBSD/FreeBSD.git] / contrib / bearssl / src / symcipher / aes_x86ni_cbcdec.c
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining 
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be 
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24
25 #define BR_ENABLE_INTRINSICS   1
26 #include "inner.h"
27
28 #if BR_AES_X86NI
29
30 /* see bearssl_block.h */
31 const br_block_cbcdec_class *
32 br_aes_x86ni_cbcdec_get_vtable(void)
33 {
34         return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL;
35 }
36
37 /* see bearssl_block.h */
38 void
39 br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx,
40         const void *key, size_t len)
41 {
42         ctx->vtable = &br_aes_x86ni_cbcdec_vtable;
43         ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len);
44 }
45
46 BR_TARGETS_X86_UP
47
48 /* see bearssl_block.h */
49 BR_TARGET("sse2,aes")
50 void
51 br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx,
52         void *iv, void *data, size_t len)
53 {
54         unsigned char *buf;
55         unsigned num_rounds;
56         __m128i sk[15], ivx;
57         unsigned u;
58
59         buf = data;
60         ivx = _mm_loadu_si128(iv);
61         num_rounds = ctx->num_rounds;
62         for (u = 0; u <= num_rounds; u ++) {
63                 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
64         }
65         while (len > 0) {
66                 __m128i x0, x1, x2, x3, e0, e1, e2, e3;
67
68                 x0 = _mm_loadu_si128((void *)(buf +  0));
69                 if (len >= 64) {
70                         x1 = _mm_loadu_si128((void *)(buf + 16));
71                         x2 = _mm_loadu_si128((void *)(buf + 32));
72                         x3 = _mm_loadu_si128((void *)(buf + 48));
73                 } else {
74                         x0 = _mm_loadu_si128((void *)(buf +  0));
75                         if (len >= 32) {
76                                 x1 = _mm_loadu_si128((void *)(buf + 16));
77                                 if (len >= 48) {
78                                         x2 = _mm_loadu_si128(
79                                                 (void *)(buf + 32));
80                                         x3 = x2;
81                                 } else {
82                                         x2 = x0;
83                                         x3 = x1;
84                                 }
85                         } else {
86                                 x1 = x0;
87                                 x2 = x0;
88                                 x3 = x0;
89                         }
90                 }
91                 e0 = x0;
92                 e1 = x1;
93                 e2 = x2;
94                 e3 = x3;
95                 x0 = _mm_xor_si128(x0, sk[0]);
96                 x1 = _mm_xor_si128(x1, sk[0]);
97                 x2 = _mm_xor_si128(x2, sk[0]);
98                 x3 = _mm_xor_si128(x3, sk[0]);
99                 x0 = _mm_aesdec_si128(x0, sk[1]);
100                 x1 = _mm_aesdec_si128(x1, sk[1]);
101                 x2 = _mm_aesdec_si128(x2, sk[1]);
102                 x3 = _mm_aesdec_si128(x3, sk[1]);
103                 x0 = _mm_aesdec_si128(x0, sk[2]);
104                 x1 = _mm_aesdec_si128(x1, sk[2]);
105                 x2 = _mm_aesdec_si128(x2, sk[2]);
106                 x3 = _mm_aesdec_si128(x3, sk[2]);
107                 x0 = _mm_aesdec_si128(x0, sk[3]);
108                 x1 = _mm_aesdec_si128(x1, sk[3]);
109                 x2 = _mm_aesdec_si128(x2, sk[3]);
110                 x3 = _mm_aesdec_si128(x3, sk[3]);
111                 x0 = _mm_aesdec_si128(x0, sk[4]);
112                 x1 = _mm_aesdec_si128(x1, sk[4]);
113                 x2 = _mm_aesdec_si128(x2, sk[4]);
114                 x3 = _mm_aesdec_si128(x3, sk[4]);
115                 x0 = _mm_aesdec_si128(x0, sk[5]);
116                 x1 = _mm_aesdec_si128(x1, sk[5]);
117                 x2 = _mm_aesdec_si128(x2, sk[5]);
118                 x3 = _mm_aesdec_si128(x3, sk[5]);
119                 x0 = _mm_aesdec_si128(x0, sk[6]);
120                 x1 = _mm_aesdec_si128(x1, sk[6]);
121                 x2 = _mm_aesdec_si128(x2, sk[6]);
122                 x3 = _mm_aesdec_si128(x3, sk[6]);
123                 x0 = _mm_aesdec_si128(x0, sk[7]);
124                 x1 = _mm_aesdec_si128(x1, sk[7]);
125                 x2 = _mm_aesdec_si128(x2, sk[7]);
126                 x3 = _mm_aesdec_si128(x3, sk[7]);
127                 x0 = _mm_aesdec_si128(x0, sk[8]);
128                 x1 = _mm_aesdec_si128(x1, sk[8]);
129                 x2 = _mm_aesdec_si128(x2, sk[8]);
130                 x3 = _mm_aesdec_si128(x3, sk[8]);
131                 x0 = _mm_aesdec_si128(x0, sk[9]);
132                 x1 = _mm_aesdec_si128(x1, sk[9]);
133                 x2 = _mm_aesdec_si128(x2, sk[9]);
134                 x3 = _mm_aesdec_si128(x3, sk[9]);
135                 if (num_rounds == 10) {
136                         x0 = _mm_aesdeclast_si128(x0, sk[10]);
137                         x1 = _mm_aesdeclast_si128(x1, sk[10]);
138                         x2 = _mm_aesdeclast_si128(x2, sk[10]);
139                         x3 = _mm_aesdeclast_si128(x3, sk[10]);
140                 } else if (num_rounds == 12) {
141                         x0 = _mm_aesdec_si128(x0, sk[10]);
142                         x1 = _mm_aesdec_si128(x1, sk[10]);
143                         x2 = _mm_aesdec_si128(x2, sk[10]);
144                         x3 = _mm_aesdec_si128(x3, sk[10]);
145                         x0 = _mm_aesdec_si128(x0, sk[11]);
146                         x1 = _mm_aesdec_si128(x1, sk[11]);
147                         x2 = _mm_aesdec_si128(x2, sk[11]);
148                         x3 = _mm_aesdec_si128(x3, sk[11]);
149                         x0 = _mm_aesdeclast_si128(x0, sk[12]);
150                         x1 = _mm_aesdeclast_si128(x1, sk[12]);
151                         x2 = _mm_aesdeclast_si128(x2, sk[12]);
152                         x3 = _mm_aesdeclast_si128(x3, sk[12]);
153                 } else {
154                         x0 = _mm_aesdec_si128(x0, sk[10]);
155                         x1 = _mm_aesdec_si128(x1, sk[10]);
156                         x2 = _mm_aesdec_si128(x2, sk[10]);
157                         x3 = _mm_aesdec_si128(x3, sk[10]);
158                         x0 = _mm_aesdec_si128(x0, sk[11]);
159                         x1 = _mm_aesdec_si128(x1, sk[11]);
160                         x2 = _mm_aesdec_si128(x2, sk[11]);
161                         x3 = _mm_aesdec_si128(x3, sk[11]);
162                         x0 = _mm_aesdec_si128(x0, sk[12]);
163                         x1 = _mm_aesdec_si128(x1, sk[12]);
164                         x2 = _mm_aesdec_si128(x2, sk[12]);
165                         x3 = _mm_aesdec_si128(x3, sk[12]);
166                         x0 = _mm_aesdec_si128(x0, sk[13]);
167                         x1 = _mm_aesdec_si128(x1, sk[13]);
168                         x2 = _mm_aesdec_si128(x2, sk[13]);
169                         x3 = _mm_aesdec_si128(x3, sk[13]);
170                         x0 = _mm_aesdeclast_si128(x0, sk[14]);
171                         x1 = _mm_aesdeclast_si128(x1, sk[14]);
172                         x2 = _mm_aesdeclast_si128(x2, sk[14]);
173                         x3 = _mm_aesdeclast_si128(x3, sk[14]);
174                 }
175                 x0 = _mm_xor_si128(x0, ivx);
176                 x1 = _mm_xor_si128(x1, e0);
177                 x2 = _mm_xor_si128(x2, e1);
178                 x3 = _mm_xor_si128(x3, e2);
179                 ivx = e3;
180                 _mm_storeu_si128((void *)(buf +  0), x0);
181                 if (len >= 64) {
182                         _mm_storeu_si128((void *)(buf + 16), x1);
183                         _mm_storeu_si128((void *)(buf + 32), x2);
184                         _mm_storeu_si128((void *)(buf + 48), x3);
185                         buf += 64;
186                         len -= 64;
187                 } else {
188                         if (len >= 32) {
189                                 _mm_storeu_si128((void *)(buf + 16), x1);
190                                 if (len >= 48) {
191                                         _mm_storeu_si128(
192                                                 (void *)(buf + 32), x2);
193                                 }
194                         }
195                         break;
196                 }
197         }
198         _mm_storeu_si128(iv, ivx);
199 }
200
201 BR_TARGETS_X86_DOWN
202
203 /* see bearssl_block.h */
204 const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = {
205         sizeof(br_aes_x86ni_cbcdec_keys),
206         16,
207         4,
208         (void (*)(const br_block_cbcdec_class **, const void *, size_t))
209                 &br_aes_x86ni_cbcdec_init,
210         (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
211                 &br_aes_x86ni_cbcdec_run
212 };
213
214 #else
215
/*
 * see bearssl_block.h
 *
 * Fallback used when BR_AES_X86NI is not enabled at build time: no
 * AES-NI implementation is compiled in, so there is no vtable to return.
 */
const br_block_cbcdec_class *
br_aes_x86ni_cbcdec_get_vtable(void)
{
        return NULL;
}
222
223 #endif