contrib/bearssl/src/symcipher/poly1305_ctmul32.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * Perform the inner processing of blocks for Poly1305.
  29  */
  30 static void
  31 poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)
  32 {
  33         /*
  34          * Implementation notes: we split the 130-bit values into ten
  35          * 13-bit words. This gives us some space for carries and allows
  36          * using only 32x32->32 multiplications, which are way faster than
  37          * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
  38          * help in making constant-time code on the Cortex-M3.
  39          *
  40          * Since we compute modulo 2^130-5, the "upper words" become
  41          * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
  42          * This has already been integrated in the r[] array, which
  43          * is extended to the 0..18 range.
  44          *
  45          * In each loop iteration, a[] and r[] words are 13-bit each,
  46          * except a[1] which may use 14 bits.
  47          */
  48         const unsigned char *buf;
  49
  50         buf = data;
  51         while (len > 0) {
  52                 unsigned char tmp[16];
  53                 uint32_t b[10];
  54                 unsigned u, v;
  55                 uint32_t z, cc1, cc2;
  56
  57                 /*
  58                  * If there is a partial block, right-pad it with zeros.
  59                  */
  60                 if (len < 16) {
  61                         memset(tmp, 0, sizeof tmp);
  62                         memcpy(tmp, buf, len);
  63                         buf = tmp;
  64                         len = 16;
  65                 }
  66
  67                 /*
  68                  * Decode next block and apply the "high bit"; that value
  69                  * is added to the accumulator.
  70                  */
  71                 v = br_dec16le(buf);
  72                 a[0] += v & 0x01FFF;
  73                 v >>= 13;
  74                 v |= buf[2] << 3;
  75                 v |= buf[3] << 11;
  76                 a[1] += v & 0x01FFF;
  77                 v >>= 13;
  78                 v |= buf[4] << 6;
  79                 a[2] += v & 0x01FFF;
  80                 v >>= 13;
  81                 v |= buf[5] << 1;
  82                 v |= buf[6] << 9;
  83                 a[3] += v & 0x01FFF;
  84                 v >>= 13;
  85                 v |= buf[7] << 4;
  86                 v |= buf[8] << 12;
  87                 a[4] += v & 0x01FFF;
  88                 v >>= 13;
  89                 v |= buf[9] << 7;
  90                 a[5] += v & 0x01FFF;
  91                 v >>= 13;
  92                 v |= buf[10] << 2;
  93                 v |= buf[11] << 10;
  94                 a[6] += v & 0x01FFF;
  95                 v >>= 13;
  96                 v |= buf[12] << 5;
  97                 a[7] += v & 0x01FFF;
  98                 v = br_dec16le(buf + 13);
  99                 a[8] += v & 0x01FFF;
 100                 v >>= 13;
 101                 v |= buf[15] << 3;
 102                 a[9] += v | 0x00800;
 103
 104                 /*
 105                  * At that point, all a[] values fit on 14 bits, while
 106                  * all r[] values fit on 13 bits. Thus products fit on
 107                  * 27 bits, and we can accumulate up to 31 of them in
 108                  * a 32-bit word and still have some room for carries.
 109                  */
 110
 111                 /*
 112                  * Now a[] contains words with values up to 14 bits each.
 113                  * We perform the multiplication with r[].
 114                  *
 115                  * The extended words of r[] may be larger than 13 bits
 116                  * (they are 5 times a 13-bit word) so the full summation
 117                  * may yield values up to 46 times a 27-bit word, which
 118                  * does not fit on a 32-bit word. To avoid that issue, we
 119                  * must split the loop below in two, with a carry
 120                  * propagation operation in the middle.
 121                  */
 122                 cc1 = 0;
 123                 for (u = 0; u < 10; u ++) {
 124                         uint32_t s;
 125
 126                         s = cc1
 127                                 + MUL15(a[0], r[u + 9 - 0])
 128                                 + MUL15(a[1], r[u + 9 - 1])
 129                                 + MUL15(a[2], r[u + 9 - 2])
 130                                 + MUL15(a[3], r[u + 9 - 3])
 131                                 + MUL15(a[4], r[u + 9 - 4]);
 132                         b[u] = s & 0x1FFF;
 133                         cc1 = s >> 13;
 134                 }
 135                 cc2 = 0;
 136                 for (u = 0; u < 10; u ++) {
 137                         uint32_t s;
 138
 139                         s = b[u] + cc2
 140                                 + MUL15(a[5], r[u + 9 - 5])
 141                                 + MUL15(a[6], r[u + 9 - 6])
 142                                 + MUL15(a[7], r[u + 9 - 7])
 143                                 + MUL15(a[8], r[u + 9 - 8])
 144                                 + MUL15(a[9], r[u + 9 - 9]);
 145                         b[u] = s & 0x1FFF;
 146                         cc2 = s >> 13;
 147                 }
 148                 memcpy(a, b, sizeof b);
 149
 150                 /*
 151                  * The two carries "loop back" with a factor of 5. We
 152                  * propagate them into a[0] and a[1].
 153                  */
 154                 z = cc1 + cc2;
 155                 z += (z << 2) + a[0];
 156                 a[0] = z & 0x1FFF;
 157                 a[1] += z >> 13;
 158
 159                 buf += 16;
 160                 len -= 16;
 161         }
 162 }
 163
 164 /* see bearssl_block.h */
 165 void
 166 br_poly1305_ctmul32_run(const void *key, const void *iv,
 167         void *data, size_t len, const void *aad, size_t aad_len,
 168         void *tag, br_chacha20_run ichacha, int encrypt)
 169 {
 170         unsigned char pkey[32], foot[16];
 171         uint32_t z, r[19], acc[10], cc, ctl;
 172         int i;
 173
 174         /*
 175          * Compute the MAC key. The 'r' value is the first 16 bytes of
 176          * pkey[].
 177          */
 178         memset(pkey, 0, sizeof pkey);
 179         ichacha(key, iv, 0, pkey, sizeof pkey);
 180
 181         /*
 182          * If encrypting, ChaCha20 must run first, followed by Poly1305.
 183          * When decrypting, the operations are reversed.
 184          */
 185         if (encrypt) {
 186                 ichacha(key, iv, 1, data, len);
 187         }
 188
 189         /*
 190          * Run Poly1305. We must process the AAD, then ciphertext, then
 191          * the footer (with the lengths). Note that the AAD and ciphertext
 192          * are meant to be padded with zeros up to the next multiple of 16,
 193          * and the length of the footer is 16 bytes as well.
 194          */
 195
 196         /*
 197          * Decode the 'r' value into 13-bit words, with the "clamping"
 198          * operation applied.
 199          */
 200         z = br_dec32le(pkey) & 0x03FFFFFF;
 201         r[9] = z & 0x1FFF;
 202         r[10] = z >> 13;
 203         z = (br_dec32le(pkey +  3) >> 2) & 0x03FFFF03;
 204         r[11] = z & 0x1FFF;
 205         r[12] = z >> 13;
 206         z = (br_dec32le(pkey +  6) >> 4) & 0x03FFC0FF;
 207         r[13] = z & 0x1FFF;
 208         r[14] = z >> 13;
 209         z = (br_dec32le(pkey +  9) >> 6) & 0x03F03FFF;
 210         r[15] = z & 0x1FFF;
 211         r[16] = z >> 13;
 212         z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
 213         r[17] = z & 0x1FFF;
 214         r[18] = z >> 13;
 215
 216         /*
 217          * Extend r[] with the 5x factor pre-applied.
 218          */
 219         for (i = 0; i < 9; i ++) {
 220                 r[i] = MUL15(5, r[i + 10]);
 221         }
 222
 223         /*
 224          * Accumulator is 0.
 225          */
 226         memset(acc, 0, sizeof acc);
 227
 228         /*
 229          * Process the additional authenticated data, ciphertext, and
 230          * footer in due order.
 231          */
 232         br_enc64le(foot, (uint64_t)aad_len);
 233         br_enc64le(foot + 8, (uint64_t)len);
 234         poly1305_inner(acc, r, aad, aad_len);
 235         poly1305_inner(acc, r, data, len);
 236         poly1305_inner(acc, r, foot, sizeof foot);
 237
 238         /*
 239          * Finalise modular reduction. This is done with carry propagation
 240          * and applying the '2^130 = -5 mod p' rule. Note that the output
 241          * of poly1035_inner() is already mostly reduced, since only
 242          * acc[1] may be (very slightly) above 2^13. A single loop back
 243          * to acc[1] will be enough to make the value fit in 130 bits.
 244          */
 245         cc = 0;
 246         for (i = 1; i < 10; i ++) {
 247                 z = acc[i] + cc;
 248                 acc[i] = z & 0x1FFF;
 249                 cc = z >> 13;
 250         }
 251         z = acc[0] + cc + (cc << 2);
 252         acc[0] = z & 0x1FFF;
 253         acc[1] += z >> 13;
 254
 255         /*
 256          * We may still have a value in the 2^130-5..2^130-1 range, in
 257          * which case we must reduce it again. The code below selects,
 258          * in constant-time, between 'acc' and 'acc-p',
 259          */
 260         ctl = GT(acc[0], 0x1FFA);
 261         for (i = 1; i < 10; i ++) {
 262                 ctl &= EQ(acc[i], 0x1FFF);
 263         }
 264         acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
 265         for (i = 1; i < 10; i ++) {
 266                 acc[i] &= ~(-ctl);
 267         }
 268
 269         /*
 270          * Convert back the accumulator to 32-bit words, and add the
 271          * 's' value (second half of pkey[]). That addition is done
 272          * modulo 2^128.
 273          */
 274         z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
 275         br_enc16le((unsigned char *)tag, z & 0xFFFF);
 276         z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
 277         br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
 278         z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
 279         br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
 280         z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
 281         br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
 282         z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
 283         br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
 284         z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
 285         br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
 286         z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
 287         br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
 288         z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
 289         br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
 290
 291         /*
 292          * If decrypting, then ChaCha20 runs _after_ Poly1305.
 293          */
 294         if (!encrypt) {
 295                 ichacha(key, iv, 1, data, len);
 296         }
 297 }