/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define BR_ENABLE_INTRINSICS   1

#include "inner.h"
/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
35 /* see bearssl_block.h */
37 br_chacha20_sse2_get(void)
40 * If using 64-bit mode, then SSE2 opcodes should be automatically
41 * available, since they are part of the ABI.
43 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
47 return &br_chacha20_sse2_run;
51 * SSE2 support is indicated by bit 26 in EDX.
53 if (br_cpuid(0, 0, 0, 0x04000000)) {
54 return &br_chacha20_sse2_run;
/* see bearssl_block.h */
/*
 * ChaCha20 stream cipher, SSE2 implementation. XORs the keystream into
 * 'data' (len bytes) in place, using the 32-byte 'key', the 12-byte
 * 'iv' (nonce) and the 32-bit block counter 'cc'. Returns the counter
 * value for the block that would come after the processed data.
 *
 * NOTE(review): upstream BearSSL decorates this definition with
 * BR_TARGET("sse2") so 32-bit builds without -msse2 can still compile
 * it — confirm against inner.h before merging.
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1, iw, cw;
	__m128i one;

	/* "expand 32-byte k" constant, as four little-endian words. */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	/* Fourth state row: counter word followed by the 12-byte nonce. */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state rows (four quarter-rounds in parallel,
			 * one per SIMD lane). Rotations are built from a
			 * shift pair since SSE2 has no rotate opcode.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter (only the low lane; the nonce
		 * words stay untouched since a carry into them would need
		 * more than 2^32 blocks, which the API does not support).
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial last block: serialize the keystream into
			 * a stack buffer and XOR it byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp + 0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			/*
			 * Full 64-byte block: XOR with SIMD opcodes.
			 * Unaligned loads/stores are used because the
			 * caller's buffer has no alignment guarantee.
			 */
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf + 0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf + 0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16() to read back the
	 * updated 32-bit counter in two 16-bit halves.
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
230 /* see bearssl_block.h */
232 br_chacha20_sse2_get(void)