]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bearssl/src/symcipher/aes_x86ni_ctr.c
MFV r357608: Limit memory usage in xz(1) instead of in tuklib.
[FreeBSD/FreeBSD.git] / contrib / bearssl / src / symcipher / aes_x86ni_ctr.c
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining 
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be 
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24
25 #define BR_ENABLE_INTRINSICS   1
26 #include "inner.h"
27
28 #if BR_AES_X86NI
29
30 /* see bearssl_block.h */
31 const br_block_ctr_class *
32 br_aes_x86ni_ctr_get_vtable(void)
33 {
34         return br_aes_x86ni_supported() ? &br_aes_x86ni_ctr_vtable : NULL;
35 }
36
37 /* see bearssl_block.h */
38 void
39 br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx,
40         const void *key, size_t len)
41 {
42         ctx->vtable = &br_aes_x86ni_ctr_vtable;
43         ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44 }
45
46 BR_TARGETS_X86_UP
47
48 /* see bearssl_block.h */
49 BR_TARGET("sse2,sse4.1,aes")
50 uint32_t
51 br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx,
52         const void *iv, uint32_t cc, void *data, size_t len)
53 {
54         unsigned char *buf;
55         unsigned char ivbuf[16];
56         unsigned num_rounds;
57         __m128i sk[15];
58         __m128i ivx;
59         unsigned u;
60
61         buf = data;
62         memcpy(ivbuf, iv, 12);
63         num_rounds = ctx->num_rounds;
64         for (u = 0; u <= num_rounds; u ++) {
65                 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
66         }
67         ivx = _mm_loadu_si128((void *)ivbuf);
68         while (len > 0) {
69                 __m128i x0, x1, x2, x3;
70
71                 x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3);
72                 x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3);
73                 x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3);
74                 x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3);
75                 x0 = _mm_xor_si128(x0, sk[0]);
76                 x1 = _mm_xor_si128(x1, sk[0]);
77                 x2 = _mm_xor_si128(x2, sk[0]);
78                 x3 = _mm_xor_si128(x3, sk[0]);
79                 x0 = _mm_aesenc_si128(x0, sk[1]);
80                 x1 = _mm_aesenc_si128(x1, sk[1]);
81                 x2 = _mm_aesenc_si128(x2, sk[1]);
82                 x3 = _mm_aesenc_si128(x3, sk[1]);
83                 x0 = _mm_aesenc_si128(x0, sk[2]);
84                 x1 = _mm_aesenc_si128(x1, sk[2]);
85                 x2 = _mm_aesenc_si128(x2, sk[2]);
86                 x3 = _mm_aesenc_si128(x3, sk[2]);
87                 x0 = _mm_aesenc_si128(x0, sk[3]);
88                 x1 = _mm_aesenc_si128(x1, sk[3]);
89                 x2 = _mm_aesenc_si128(x2, sk[3]);
90                 x3 = _mm_aesenc_si128(x3, sk[3]);
91                 x0 = _mm_aesenc_si128(x0, sk[4]);
92                 x1 = _mm_aesenc_si128(x1, sk[4]);
93                 x2 = _mm_aesenc_si128(x2, sk[4]);
94                 x3 = _mm_aesenc_si128(x3, sk[4]);
95                 x0 = _mm_aesenc_si128(x0, sk[5]);
96                 x1 = _mm_aesenc_si128(x1, sk[5]);
97                 x2 = _mm_aesenc_si128(x2, sk[5]);
98                 x3 = _mm_aesenc_si128(x3, sk[5]);
99                 x0 = _mm_aesenc_si128(x0, sk[6]);
100                 x1 = _mm_aesenc_si128(x1, sk[6]);
101                 x2 = _mm_aesenc_si128(x2, sk[6]);
102                 x3 = _mm_aesenc_si128(x3, sk[6]);
103                 x0 = _mm_aesenc_si128(x0, sk[7]);
104                 x1 = _mm_aesenc_si128(x1, sk[7]);
105                 x2 = _mm_aesenc_si128(x2, sk[7]);
106                 x3 = _mm_aesenc_si128(x3, sk[7]);
107                 x0 = _mm_aesenc_si128(x0, sk[8]);
108                 x1 = _mm_aesenc_si128(x1, sk[8]);
109                 x2 = _mm_aesenc_si128(x2, sk[8]);
110                 x3 = _mm_aesenc_si128(x3, sk[8]);
111                 x0 = _mm_aesenc_si128(x0, sk[9]);
112                 x1 = _mm_aesenc_si128(x1, sk[9]);
113                 x2 = _mm_aesenc_si128(x2, sk[9]);
114                 x3 = _mm_aesenc_si128(x3, sk[9]);
115                 if (num_rounds == 10) {
116                         x0 = _mm_aesenclast_si128(x0, sk[10]);
117                         x1 = _mm_aesenclast_si128(x1, sk[10]);
118                         x2 = _mm_aesenclast_si128(x2, sk[10]);
119                         x3 = _mm_aesenclast_si128(x3, sk[10]);
120                 } else if (num_rounds == 12) {
121                         x0 = _mm_aesenc_si128(x0, sk[10]);
122                         x1 = _mm_aesenc_si128(x1, sk[10]);
123                         x2 = _mm_aesenc_si128(x2, sk[10]);
124                         x3 = _mm_aesenc_si128(x3, sk[10]);
125                         x0 = _mm_aesenc_si128(x0, sk[11]);
126                         x1 = _mm_aesenc_si128(x1, sk[11]);
127                         x2 = _mm_aesenc_si128(x2, sk[11]);
128                         x3 = _mm_aesenc_si128(x3, sk[11]);
129                         x0 = _mm_aesenclast_si128(x0, sk[12]);
130                         x1 = _mm_aesenclast_si128(x1, sk[12]);
131                         x2 = _mm_aesenclast_si128(x2, sk[12]);
132                         x3 = _mm_aesenclast_si128(x3, sk[12]);
133                 } else {
134                         x0 = _mm_aesenc_si128(x0, sk[10]);
135                         x1 = _mm_aesenc_si128(x1, sk[10]);
136                         x2 = _mm_aesenc_si128(x2, sk[10]);
137                         x3 = _mm_aesenc_si128(x3, sk[10]);
138                         x0 = _mm_aesenc_si128(x0, sk[11]);
139                         x1 = _mm_aesenc_si128(x1, sk[11]);
140                         x2 = _mm_aesenc_si128(x2, sk[11]);
141                         x3 = _mm_aesenc_si128(x3, sk[11]);
142                         x0 = _mm_aesenc_si128(x0, sk[12]);
143                         x1 = _mm_aesenc_si128(x1, sk[12]);
144                         x2 = _mm_aesenc_si128(x2, sk[12]);
145                         x3 = _mm_aesenc_si128(x3, sk[12]);
146                         x0 = _mm_aesenc_si128(x0, sk[13]);
147                         x1 = _mm_aesenc_si128(x1, sk[13]);
148                         x2 = _mm_aesenc_si128(x2, sk[13]);
149                         x3 = _mm_aesenc_si128(x3, sk[13]);
150                         x0 = _mm_aesenclast_si128(x0, sk[14]);
151                         x1 = _mm_aesenclast_si128(x1, sk[14]);
152                         x2 = _mm_aesenclast_si128(x2, sk[14]);
153                         x3 = _mm_aesenclast_si128(x3, sk[14]);
154                 }
155                 if (len >= 64) {
156                         x0 = _mm_xor_si128(x0,
157                                 _mm_loadu_si128((void *)(buf +  0)));
158                         x1 = _mm_xor_si128(x1,
159                                 _mm_loadu_si128((void *)(buf + 16)));
160                         x2 = _mm_xor_si128(x2,
161                                 _mm_loadu_si128((void *)(buf + 32)));
162                         x3 = _mm_xor_si128(x3,
163                                 _mm_loadu_si128((void *)(buf + 48)));
164                         _mm_storeu_si128((void *)(buf +  0), x0);
165                         _mm_storeu_si128((void *)(buf + 16), x1);
166                         _mm_storeu_si128((void *)(buf + 32), x2);
167                         _mm_storeu_si128((void *)(buf + 48), x3);
168                         buf += 64;
169                         len -= 64;
170                         cc += 4;
171                 } else {
172                         unsigned char tmp[64];
173
174                         _mm_storeu_si128((void *)(tmp +  0), x0);
175                         _mm_storeu_si128((void *)(tmp + 16), x1);
176                         _mm_storeu_si128((void *)(tmp + 32), x2);
177                         _mm_storeu_si128((void *)(tmp + 48), x3);
178                         for (u = 0; u < len; u ++) {
179                                 buf[u] ^= tmp[u];
180                         }
181                         cc += (uint32_t)len >> 4;
182                         break;
183                 }
184         }
185         return cc;
186 }
187
188 BR_TARGETS_X86_DOWN
189
190 /* see bearssl_block.h */
191 const br_block_ctr_class br_aes_x86ni_ctr_vtable = {
192         sizeof(br_aes_x86ni_ctr_keys),
193         16,
194         4,
195         (void (*)(const br_block_ctr_class **, const void *, size_t))
196                 &br_aes_x86ni_ctr_init,
197         (uint32_t (*)(const br_block_ctr_class *const *,
198                 const void *, uint32_t, void *, size_t))
199                 &br_aes_x86ni_ctr_run
200 };
201
202 #else
203
204 /* see bearssl_block.h */
205 const br_block_ctr_class *
206 br_aes_x86ni_ctr_get_vtable(void)
207 {
208         return NULL;
209 }
210
211 #endif