]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bearssl/src/symcipher/chacha20_sse2.c
MFV r350080:
[FreeBSD/FreeBSD.git] / contrib / bearssl / src / symcipher / chacha20_sse2.c
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining 
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be 
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24
25 #define BR_ENABLE_INTRINSICS   1
26 #include "inner.h"
27
28 #if BR_SSE2
29
30 /*
31  * This file contains a ChaCha20 implementation that leverages SSE2
32  * opcodes for better performance.
33  */
34
35 /* see bearssl_block.h */
36 br_chacha20_run
37 br_chacha20_sse2_get(void)
38 {
39         /*
40          * If using 64-bit mode, then SSE2 opcodes should be automatically
41          * available, since they are part of the ABI.
42          *
43          * In 32-bit mode, we use CPUID to detect the SSE2 feature.
44          */
45
46 #if BR_amd64
47         return &br_chacha20_sse2_run;
48 #else
49
50         /*
51          * SSE2 support is indicated by bit 26 in EDX.
52          */
53         if (br_cpuid(0, 0, 0, 0x04000000)) {
54                 return &br_chacha20_sse2_run;
55         } else {
56                 return 0;
57         }
58 #endif
59 }
60
61 BR_TARGETS_X86_UP
62
63 /* see bearssl_block.h */
64 BR_TARGET("sse2")
65 uint32_t
66 br_chacha20_sse2_run(const void *key,
67         const void *iv, uint32_t cc, void *data, size_t len)
68 {
69         unsigned char *buf;
70         uint32_t ivtmp[4];
71         __m128i kw0, kw1;
72         __m128i iw, cw;
73         __m128i one;
74
75         static const uint32_t CW[] = {
76                 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
77         };
78
79         buf = data;
80         kw0 = _mm_loadu_si128(key);
81         kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
82         ivtmp[0] = cc;
83         memcpy(ivtmp + 1, iv, 12);
84         iw = _mm_loadu_si128((const void *)ivtmp);
85         cw = _mm_loadu_si128((const void *)CW);
86         one = _mm_set_epi32(0, 0, 0, 1);
87
88         while (len > 0) {
89                 /*
90                  * sj contains state words 4*j to 4*j+3.
91                  */
92                 __m128i s0, s1, s2, s3;
93                 int i;
94
95                 s0 = cw;
96                 s1 = kw0;
97                 s2 = kw1;
98                 s3 = iw;
99                 for (i = 0; i < 10; i ++) {
100                         /*
101                          * Even round is straightforward application on
102                          * the state words.
103                          */
104                         s0 = _mm_add_epi32(s0, s1);
105                         s3 = _mm_xor_si128(s3, s0);
106                         s3 = _mm_or_si128(
107                                 _mm_slli_epi32(s3, 16),
108                                 _mm_srli_epi32(s3, 16));
109
110                         s2 = _mm_add_epi32(s2, s3);
111                         s1 = _mm_xor_si128(s1, s2);
112                         s1 = _mm_or_si128(
113                                 _mm_slli_epi32(s1, 12),
114                                 _mm_srli_epi32(s1, 20));
115
116                         s0 = _mm_add_epi32(s0, s1);
117                         s3 = _mm_xor_si128(s3, s0);
118                         s3 = _mm_or_si128(
119                                 _mm_slli_epi32(s3, 8),
120                                 _mm_srli_epi32(s3, 24));
121
122                         s2 = _mm_add_epi32(s2, s3);
123                         s1 = _mm_xor_si128(s1, s2);
124                         s1 = _mm_or_si128(
125                                 _mm_slli_epi32(s1, 7),
126                                 _mm_srli_epi32(s1, 25));
127
128                         /*
129                          * For the odd round, we must rotate some state
130                          * words so that the computations apply on the
131                          * right combinations of words.
132                          */
133                         s1 = _mm_shuffle_epi32(s1, 0x39);
134                         s2 = _mm_shuffle_epi32(s2, 0x4E);
135                         s3 = _mm_shuffle_epi32(s3, 0x93);
136
137                         s0 = _mm_add_epi32(s0, s1);
138                         s3 = _mm_xor_si128(s3, s0);
139                         s3 = _mm_or_si128(
140                                 _mm_slli_epi32(s3, 16),
141                                 _mm_srli_epi32(s3, 16));
142
143                         s2 = _mm_add_epi32(s2, s3);
144                         s1 = _mm_xor_si128(s1, s2);
145                         s1 = _mm_or_si128(
146                                 _mm_slli_epi32(s1, 12),
147                                 _mm_srli_epi32(s1, 20));
148
149                         s0 = _mm_add_epi32(s0, s1);
150                         s3 = _mm_xor_si128(s3, s0);
151                         s3 = _mm_or_si128(
152                                 _mm_slli_epi32(s3, 8),
153                                 _mm_srli_epi32(s3, 24));
154
155                         s2 = _mm_add_epi32(s2, s3);
156                         s1 = _mm_xor_si128(s1, s2);
157                         s1 = _mm_or_si128(
158                                 _mm_slli_epi32(s1, 7),
159                                 _mm_srli_epi32(s1, 25));
160
161                         /*
162                          * After the odd round, we rotate back the values
163                          * to undo the rotate at the start of the odd round.
164                          */
165                         s1 = _mm_shuffle_epi32(s1, 0x93);
166                         s2 = _mm_shuffle_epi32(s2, 0x4E);
167                         s3 = _mm_shuffle_epi32(s3, 0x39);
168                 }
169
170                 /*
171                  * Addition with the initial state.
172                  */
173                 s0 = _mm_add_epi32(s0, cw);
174                 s1 = _mm_add_epi32(s1, kw0);
175                 s2 = _mm_add_epi32(s2, kw1);
176                 s3 = _mm_add_epi32(s3, iw);
177
178                 /*
179                  * Increment block counter.
180                  */
181                 iw = _mm_add_epi32(iw, one);
182
183                 /*
184                  * XOR final state with the data.
185                  */
186                 if (len < 64) {
187                         unsigned char tmp[64];
188                         size_t u;
189
190                         _mm_storeu_si128((void *)(tmp +  0), s0);
191                         _mm_storeu_si128((void *)(tmp + 16), s1);
192                         _mm_storeu_si128((void *)(tmp + 32), s2);
193                         _mm_storeu_si128((void *)(tmp + 48), s3);
194                         for (u = 0; u < len; u ++) {
195                                 buf[u] ^= tmp[u];
196                         }
197                         break;
198                 } else {
199                         __m128i b0, b1, b2, b3;
200
201                         b0 = _mm_loadu_si128((const void *)(buf +  0));
202                         b1 = _mm_loadu_si128((const void *)(buf + 16));
203                         b2 = _mm_loadu_si128((const void *)(buf + 32));
204                         b3 = _mm_loadu_si128((const void *)(buf + 48));
205                         b0 = _mm_xor_si128(b0, s0);
206                         b1 = _mm_xor_si128(b1, s1);
207                         b2 = _mm_xor_si128(b2, s2);
208                         b3 = _mm_xor_si128(b3, s3);
209                         _mm_storeu_si128((void *)(buf +  0), b0);
210                         _mm_storeu_si128((void *)(buf + 16), b1);
211                         _mm_storeu_si128((void *)(buf + 32), b2);
212                         _mm_storeu_si128((void *)(buf + 48), b3);
213                         buf += 64;
214                         len -= 64;
215                 }
216         }
217
218         /*
219          * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
220          * raw SSE2, thus we use _mm_extract_epi16().
221          */
222         return (uint32_t)_mm_extract_epi16(iw, 0)
223                 | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
224 }
225
226 BR_TARGETS_X86_DOWN
227
228 #else
229
230 /* see bearssl_block.h */
231 br_chacha20_run
232 br_chacha20_sse2_get(void)
233 {
234         return 0;
235 }
236
237 #endif