/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
14 #ifndef __BLAKE2B_LOAD_SSE41_H__
15 #define __BLAKE2B_LOAD_SSE41_H__
/*
 * LOAD_MSG_0_1(b0, b1): sets b0 = { m0.lo, m1.lo } and b1 = { m2.lo, m3.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_0_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)
/*
 * LOAD_MSG_0_2(b0, b1): sets b0 = { m0.hi, m1.hi } and b1 = { m2.hi, m3.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_0_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)
/*
 * LOAD_MSG_0_3(b0, b1): sets b0 = { m4.lo, m5.lo } and b1 = { m6.lo, m7.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_0_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
/*
 * LOAD_MSG_0_4(b0, b1): sets b0 = { m4.hi, m5.hi } and b1 = { m6.hi, m7.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_0_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)
/*
 * LOAD_MSG_1_1(b0, b1): sets b0 = { m7.lo, m2.lo } and b1 = { m4.hi, m6.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_1_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)
/*
 * LOAD_MSG_1_2(b0, b1): sets b0 = { m5.lo, m4.lo } and b1 = { m7.hi, m3.lo }
 * (the 8-byte alignr takes the high lane of its second operand followed by
 * the low lane of its first).  m0..m7 must be in scope at the expansion
 * site.  Expands to a single statement.
 */
#define LOAD_MSG_1_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)
/*
 * LOAD_MSG_1_3(b0, b1): sets b0 = { m0.hi, m0.lo } (the 32-bit shuffle
 * pattern 1,0,3,2 swaps the two 64-bit lanes) and b1 = { m5.hi, m2.hi }.
 * m0..m7 must be in scope at the expansion site.  Expands to a single
 * statement.
 */
#define LOAD_MSG_1_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)
/*
 * LOAD_MSG_1_4(b0, b1): sets b0 = { m6.lo, m1.lo } and b1 = { m3.hi, m1.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_1_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m1); \
    (b1) = _mm_unpackhi_epi64(m3, m1); \
  } while (0)
/*
 * LOAD_MSG_2_1(b0, b1): sets b0 = { m5.hi, m6.lo } (8-byte alignr: high lane
 * of the second operand, then low lane of the first) and
 * b1 = { m2.hi, m7.hi }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_2_1(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m2, m7); \
  } while (0)
/*
 * LOAD_MSG_2_2(b0, b1): sets b0 = { m4.lo, m0.lo } and b1 = { m1.lo, m6.hi }
 * (blend mask 0xF0 keeps the low 64 bits of the first operand and takes the
 * high 64 bits of the second).  m0..m7 must be in scope at the expansion
 * site.  Expands to a single statement.
 */
#define LOAD_MSG_2_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m0); \
    (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_2_3(b0, b1): sets b0 = { m5.lo, m1.hi } (blend mask 0xF0: low 64
 * bits from the first operand, high 64 bits from the second) and
 * b1 = { m3.hi, m4.hi }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_2_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m3, m4); \
  } while (0)
/*
 * LOAD_MSG_2_4(b0, b1): sets b0 = { m7.lo, m3.lo } and b1 = { m0.hi, m2.lo }
 * (8-byte alignr: high lane of the second operand, then low lane of the
 * first).  m0..m7 must be in scope at the expansion site.  Expands to a
 * single statement.
 */
#define LOAD_MSG_2_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m3); \
    (b1) = _mm_alignr_epi8(m2, m0, 8); \
  } while (0)
/*
 * LOAD_MSG_3_1(b0, b1): sets b0 = { m3.hi, m1.hi } and b1 = { m6.hi, m5.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_3_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_unpackhi_epi64(m6, m5); \
  } while (0)
/*
 * LOAD_MSG_3_2(b0, b1): sets b0 = { m4.hi, m0.hi } and b1 = { m6.lo, m7.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_3_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m0); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
/*
 * LOAD_MSG_3_3(b0, b1): sets b0 = { m1.lo, m2.hi } and b1 = { m2.lo, m7.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_3_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_3_4(b0, b1): sets b0 = { m3.lo, m5.lo } and b1 = { m0.lo, m4.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_3_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m5); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)
/*
 * LOAD_MSG_4_1(b0, b1): sets b0 = { m4.hi, m2.hi } and b1 = { m1.lo, m5.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_4_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m2); \
    (b1) = _mm_unpacklo_epi64(m1, m5); \
  } while (0)
/*
 * LOAD_MSG_4_2(b0, b1): sets b0 = { m0.lo, m3.hi } and b1 = { m2.lo, m7.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_4_2(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_4_3(b0, b1): sets b0 = { m7.lo, m5.hi } and b1 = { m3.lo, m1.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_4_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
    (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_4_4(b0, b1): sets b0 = { m0.hi, m6.lo } (8-byte alignr: high lane
 * of the second operand, then low lane of the first) and
 * b1 = { m4.lo, m6.hi } (blend mask 0xF0).  m0..m7 must be in scope at the
 * expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_4_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m6, m0, 8); \
    (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_5_1(b0, b1): sets b0 = { m1.lo, m3.lo } and b1 = { m0.lo, m4.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_5_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m3); \
    (b1) = _mm_unpacklo_epi64(m0, m4); \
  } while (0)
/*
 * LOAD_MSG_5_2(b0, b1): sets b0 = { m6.lo, m5.lo } and b1 = { m5.hi, m1.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_5_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m6, m5); \
    (b1) = _mm_unpackhi_epi64(m5, m1); \
  } while (0)
/*
 * LOAD_MSG_5_3(b0, b1): sets b0 = { m2.lo, m3.hi } (blend mask 0xF0: low 64
 * bits from the first operand, high 64 bits from the second) and
 * b1 = { m7.hi, m0.hi }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_5_3(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
    (b1) = _mm_unpackhi_epi64(m7, m0); \
  } while (0)
/*
 * LOAD_MSG_5_4(b0, b1): sets b0 = { m6.hi, m2.hi } and b1 = { m7.lo, m4.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_5_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m2); \
    (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_6_1(b0, b1): sets b0 = { m6.lo, m0.hi } (blend mask 0xF0: low 64
 * bits from the first operand, high 64 bits from the second) and
 * b1 = { m7.lo, m2.lo }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_6_1(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
    (b1) = _mm_unpacklo_epi64(m7, m2); \
  } while (0)
/*
 * LOAD_MSG_6_2(b0, b1): sets b0 = { m2.hi, m7.hi } and b1 = { m6.hi, m5.lo }
 * (8-byte alignr: high lane of the second operand, then low lane of the
 * first).  m0..m7 must be in scope at the expansion site.  Expands to a
 * single statement.
 */
#define LOAD_MSG_6_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_alignr_epi8(m5, m6, 8); \
  } while (0)
/*
 * LOAD_MSG_6_3(b0, b1): sets b0 = { m0.lo, m3.lo } and b1 = { m4.hi, m4.lo }
 * (the 32-bit shuffle pattern 1,0,3,2 swaps the two 64-bit lanes).
 * m0..m7 must be in scope at the expansion site.  Expands to a single
 * statement.
 */
#define LOAD_MSG_6_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m3); \
    (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
  } while (0)
/*
 * LOAD_MSG_6_4(b0, b1): sets b0 = { m3.hi, m1.hi } and b1 = { m1.lo, m5.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_6_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m3, m1); \
    (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_7_1(b0, b1): sets b0 = { m6.hi, m3.hi } and b1 = { m6.lo, m1.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_7_1(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m6, m3); \
    (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_7_2(b0, b1): sets b0 = { m5.hi, m7.lo } (8-byte alignr: high lane
 * of the second operand, then low lane of the first) and
 * b1 = { m0.hi, m4.hi }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_7_2(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpackhi_epi64(m0, m4); \
  } while (0)
/*
 * LOAD_MSG_7_3(b0, b1): sets b0 = { m2.hi, m7.hi } and b1 = { m4.lo, m1.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_7_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m2, m7); \
    (b1) = _mm_unpacklo_epi64(m4, m1); \
  } while (0)
/*
 * LOAD_MSG_7_4(b0, b1): sets b0 = { m0.lo, m2.lo } and b1 = { m3.lo, m5.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_7_4(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m2); \
    (b1) = _mm_unpacklo_epi64(m3, m5); \
  } while (0)
/*
 * LOAD_MSG_8_1(b0, b1): sets b0 = { m3.lo, m7.lo } and b1 = { m5.hi, m0.lo }
 * (8-byte alignr: high lane of the second operand, then low lane of the
 * first).  m0..m7 must be in scope at the expansion site.  Expands to a
 * single statement.
 */
#define LOAD_MSG_8_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m3, m7); \
    (b1) = _mm_alignr_epi8(m0, m5, 8); \
  } while (0)
/*
 * LOAD_MSG_8_2(b0, b1): sets b0 = { m7.hi, m4.hi } and b1 = { m1.hi, m4.lo }
 * (8-byte alignr: high lane of the second operand, then low lane of the
 * first).  m0..m7 must be in scope at the expansion site.  Expands to a
 * single statement.
 */
#define LOAD_MSG_8_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_alignr_epi8(m4, m1, 8); \
  } while (0)
/*
 * LOAD_MSG_8_3(b0, b1): sets b0 = m6 and b1 = { m0.hi, m5.lo } (8-byte
 * alignr: high lane of the second operand, then low lane of the first).
 * m0..m7 must be in scope at the expansion site.  Expands to a single
 * statement.
 *
 * The b0 assignment was absent, leaving the first output unset.  It is
 * restored from the BLAKE2b message schedule (sigma[8] in RFC 7693 selects
 * words 12, 13, 1, 10 at this position), assuming m<k> holds message words
 * 2k and 2k+1 -- the layout the b1 expression here is consistent with.
 */
#define LOAD_MSG_8_3(b0, b1) \
  do { \
    (b0) = m6; \
    (b1) = _mm_alignr_epi8(m5, m0, 8); \
  } while (0)
/*
 * LOAD_MSG_8_4(b0, b1): sets b0 = { m1.lo, m3.hi } (blend mask 0xF0: low 64
 * bits from the first operand, high 64 bits from the second) and b1 = m2.
 * m0..m7 must be in scope at the expansion site.  Expands to a single
 * statement.
 *
 * The b1 assignment was absent, leaving the second output unset.  It is
 * restored from the BLAKE2b message schedule (sigma[8] in RFC 7693 selects
 * words 2, 7, 4, 5 at this position), assuming m<k> holds message words
 * 2k and 2k+1 -- the layout the b0 expression here is consistent with.
 */
#define LOAD_MSG_8_4(b0, b1) \
  do { \
    (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
    (b1) = m2; \
  } while (0)
/*
 * LOAD_MSG_9_1(b0, b1): sets b0 = { m5.lo, m4.lo } and b1 = { m3.hi, m0.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_9_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_unpackhi_epi64(m3, m0); \
  } while (0)
/*
 * LOAD_MSG_9_2(b0, b1): sets b0 = { m1.lo, m2.lo } and b1 = { m3.lo, m2.hi }
 * (blend mask 0xF0: low 64 bits from the first operand, high 64 bits from
 * the second).  m0..m7 must be in scope at the expansion site.  Expands to
 * a single statement.
 */
#define LOAD_MSG_9_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m1, m2); \
    (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
  } while (0)
/*
 * LOAD_MSG_9_3(b0, b1): sets b0 = { m7.hi, m4.hi } and b1 = { m1.hi, m6.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_9_3(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m7, m4); \
    (b1) = _mm_unpackhi_epi64(m1, m6); \
  } while (0)
/*
 * LOAD_MSG_9_4(b0, b1): sets b0 = { m5.hi, m7.lo } (8-byte alignr: high lane
 * of the second operand, then low lane of the first) and
 * b1 = { m6.lo, m0.lo }.  m0..m7 must be in scope at the expansion site.
 * Expands to a single statement.
 */
#define LOAD_MSG_9_4(b0, b1) \
  do { \
    (b0) = _mm_alignr_epi8(m7, m5, 8); \
    (b1) = _mm_unpacklo_epi64(m6, m0); \
  } while (0)
/*
 * LOAD_MSG_10_1(b0, b1): sets b0 = { m0.lo, m1.lo } and b1 = { m2.lo, m3.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_10_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m0, m1); \
    (b1) = _mm_unpacklo_epi64(m2, m3); \
  } while (0)
/*
 * LOAD_MSG_10_2(b0, b1): sets b0 = { m0.hi, m1.hi } and b1 = { m2.hi, m3.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_10_2(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m0, m1); \
    (b1) = _mm_unpackhi_epi64(m2, m3); \
  } while (0)
/*
 * LOAD_MSG_10_3(b0, b1): sets b0 = { m4.lo, m5.lo } and b1 = { m6.lo, m7.lo },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_10_3(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m4, m5); \
    (b1) = _mm_unpacklo_epi64(m6, m7); \
  } while (0)
/*
 * LOAD_MSG_10_4(b0, b1): sets b0 = { m4.hi, m5.hi } and b1 = { m6.hi, m7.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_10_4(b0, b1) \
  do { \
    (b0) = _mm_unpackhi_epi64(m4, m5); \
    (b1) = _mm_unpackhi_epi64(m6, m7); \
  } while (0)
/*
 * LOAD_MSG_11_1(b0, b1): sets b0 = { m7.lo, m2.lo } and b1 = { m4.hi, m6.hi },
 * where .lo/.hi denote the low/high 64-bit lanes of the m0..m7 vectors that
 * must be in scope at the expansion site.  Expands to a single statement.
 */
#define LOAD_MSG_11_1(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m7, m2); \
    (b1) = _mm_unpackhi_epi64(m4, m6); \
  } while (0)
/*
 * LOAD_MSG_11_2(b0, b1): sets b0 = { m5.lo, m4.lo } and b1 = { m7.hi, m3.lo }
 * (8-byte alignr: high lane of the second operand, then low lane of the
 * first).  m0..m7 must be in scope at the expansion site.  Expands to a
 * single statement.
 */
#define LOAD_MSG_11_2(b0, b1) \
  do { \
    (b0) = _mm_unpacklo_epi64(m5, m4); \
    (b1) = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)
/*
 * LOAD_MSG_11_3(b0, b1): sets b0 = { m0.hi, m0.lo } (the 32-bit shuffle
 * pattern 1,0,3,2 swaps the two 64-bit lanes) and b1 = { m5.hi, m2.hi }.
 * m0..m7 must be in scope at the expansion site.  Expands to a single
 * statement.
 */
#define LOAD_MSG_11_3(b0, b1) \
  do { \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
    (b1) = _mm_unpackhi_epi64(m5, m2); \
  } while (0)
393 #define LOAD_MSG_11_4(b0, b1) \
396 b0 = _mm_unpacklo_epi64(m6, m1); \
397 b1 = _mm_unpackhi_epi64(m3, m1); \