/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define BR_ENABLE_INTRINSICS   1

#include "inner.h"
/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
35 /* see bearssl_block.h */
37 br_chacha20_sse2_get(void)
40 * If using 64-bit mode, then SSE2 opcodes should be automatically
41 * available, since they are part of the ABI.
43 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
47 return &br_chacha20_sse2_run;
51 * SSE2 support is indicated by bit 26 in EDX.
53 if (br_cpuid(0, 0, 0, 0x04000000)) {
54 return &br_chacha20_sse2_run;
/* see bearssl_block.h */
/*
 * ChaCha20 stream cipher, SSE2 implementation. XORs the keystream into
 * 'data' (len bytes) in place, using the 32-byte 'key', the 12-byte
 * 'iv' (nonce) and the 32-bit block counter 'cc'. Returns the counter
 * value for the block that would come after the processed data.
 *
 * NOTE(review): upstream BearSSL decorates this definition with
 * BR_TARGET("sse2") so 32-bit builds without -msse2 can still compile
 * it — confirm against inner.h before merging.
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1, iw, cw;
	__m128i one;

	/* "expand 32-byte k" constant, as four little-endian words. */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	/* Fourth state row: counter word followed by the 12-byte nonce. */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state rows (four quarter-rounds in parallel,
			 * one per SIMD lane). Rotations are built from a
			 * shift pair since SSE2 has no rotate opcode.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter (only the low lane; the nonce
		 * words stay untouched since a carry into them would need
		 * more than 2^32 blocks, which the API does not support).
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial last block: serialize the keystream into
			 * a stack buffer and XOR it byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp + 0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			/*
			 * Full 64-byte block: XOR with SIMD opcodes.
			 * Unaligned loads/stores are used because the
			 * caller's buffer has no alignment guarantee.
			 */
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf + 0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf + 0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16() to read back the
	 * updated 32-bit counter in two 16-bit halves.
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
230 /* see bearssl_block.h */
232 br_chacha20_sse2_get(void)