contrib/bearssl/src/ec/ec_p256_m15.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #else
  41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42 #endif
  43
  44 /*
  45  * Convert an integer from unsigned big-endian encoding to a sequence of
  46  * 13-bit words in little-endian order. The final "partial" word is
  47  * returned.
  48  */
  49 static uint32_t
  50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51 {
  52         uint32_t acc;
  53         int acc_len;
  54
  55         acc = 0;
  56         acc_len = 0;
  57         while (len -- > 0) {
  58                 acc |= (uint32_t)src[len] << acc_len;
  59                 acc_len += 8;
  60                 if (acc_len >= 13) {
  61                         *dst ++ = acc & 0x1FFF;
  62                         acc >>= 13;
  63                         acc_len -= 13;
  64                 }
  65         }
  66         return acc;
  67 }
  68
  69 /*
  70  * Convert an integer (13-bit words, little-endian) to unsigned
  71  * big-endian encoding. The total encoding length is provided; all
  72  * the destination bytes will be filled.
  73  */
  74 static void
  75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76 {
  77         uint32_t acc;
  78         int acc_len;
  79
  80         acc = 0;
  81         acc_len = 0;
  82         while (len -- > 0) {
  83                 if (acc_len < 8) {
  84                         acc |= (*src ++) << acc_len;
  85                         acc_len += 13;
  86                 }
  87                 dst[len] = (unsigned char)acc;
  88                 acc >>= 8;
  89                 acc_len -= 8;
  90         }
  91 }
  92
  93 /*
  94  * Normalise an array of words to a strict 13 bits per word. Returned
  95  * value is the resulting carry. The source (w) and destination (d)
  96  * arrays may be identical, but shall not overlap partially.
  97  */
  98 static inline uint32_t
  99 norm13(uint32_t *d, const uint32_t *w, size_t len)
 100 {
 101         size_t u;
 102         uint32_t cc;
 103
 104         cc = 0;
 105         for (u = 0; u < len; u ++) {
 106                 int32_t z;
 107
 108                 z = w[u] + cc;
 109                 d[u] = z & 0x1FFF;
 110                 cc = ARSH(z, 13);
 111         }
 112         return cc;
 113 }
 114
 115 /*
 116  * mul20() multiplies two 260-bit integers together. Each word must fit
 117  * on 13 bits; source operands use 20 words, destination operand
 118  * receives 40 words. All overlaps allowed.
 119  *
 120  * square20() computes the square of a 260-bit integer. Each word must
 121  * fit on 13 bits; source operand uses 20 words, destination operand
 122  * receives 40 words. All overlaps allowed.
 123  */
 124
 125 #if BR_SLOW_MUL15
 126
 127 static void
 128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 129 {
 130         /*
 131          * Two-level Karatsuba: turns a 20x20 multiplication into
 132          * nine 5x5 multiplications. We use 13-bit words but do not
 133          * propagate carries immediately, so words may expand:
 134          *
 135          *  - First Karatsuba decomposition turns the 20x20 mul on
 136          *    13-bit words into three 10x10 muls, two on 13-bit words
 137          *    and one on 14-bit words.
 138          *
 139          *  - Second Karatsuba decomposition further splits these into:
 140          *
 141          *     * four 5x5 muls on 13-bit words
 142          *     * four 5x5 muls on 14-bit words
 143          *     * one 5x5 mul on 15-bit words
 144          *
 145          * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
 146          * or 15-bit words, respectively.
 147          */
 148         uint32_t u[45], v[45], w[90];
 149         uint32_t cc;
 150         int i;
 151
 152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 153                 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
 154                         + (s2w)[5 * (s2_off) + 0]; \
 155                 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
 156                         + (s2w)[5 * (s2_off) + 1]; \
 157                 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
 158                         + (s2w)[5 * (s2_off) + 2]; \
 159                 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
 160                         + (s2w)[5 * (s2_off) + 3]; \
 161                 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
 162                         + (s2w)[5 * (s2_off) + 4]; \
 163         } while (0)
 164
 165 #define ZADDT(dw, d_off, sw, s_off)   do { \
 166                 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
 167                 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
 168                 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
 169                 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
 170                 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
 171         } while (0)
 172
 173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 174                 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
 175                         + (s2w)[5 * (s2_off) + 0]; \
 176                 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
 177                         + (s2w)[5 * (s2_off) + 1]; \
 178                 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
 179                         + (s2w)[5 * (s2_off) + 2]; \
 180                 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
 181                         + (s2w)[5 * (s2_off) + 3]; \
 182                 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
 183                         + (s2w)[5 * (s2_off) + 4]; \
 184         } while (0)
 185
 186 #define CPR1(w, cprcc)   do { \
 187                 uint32_t cprz = (w) + cprcc; \
 188                 (w) = cprz & 0x1FFF; \
 189                 cprcc = cprz >> 13; \
 190         } while (0)
 191
 192 #define CPR(dw, d_off)   do { \
 193                 uint32_t cprcc; \
 194                 cprcc = 0; \
 195                 CPR1((dw)[(d_off) + 0], cprcc); \
 196                 CPR1((dw)[(d_off) + 1], cprcc); \
 197                 CPR1((dw)[(d_off) + 2], cprcc); \
 198                 CPR1((dw)[(d_off) + 3], cprcc); \
 199                 CPR1((dw)[(d_off) + 4], cprcc); \
 200                 CPR1((dw)[(d_off) + 5], cprcc); \
 201                 CPR1((dw)[(d_off) + 6], cprcc); \
 202                 CPR1((dw)[(d_off) + 7], cprcc); \
 203                 CPR1((dw)[(d_off) + 8], cprcc); \
 204                 (dw)[(d_off) + 9] = cprcc; \
 205         } while (0)
 206
 207         memcpy(u, a, 20 * sizeof *a);
 208         ZADD(u, 4, a, 0, a, 1);
 209         ZADD(u, 5, a, 2, a, 3);
 210         ZADD(u, 6, a, 0, a, 2);
 211         ZADD(u, 7, a, 1, a, 3);
 212         ZADD(u, 8, u, 6, u, 7);
 213
 214         memcpy(v, b, 20 * sizeof *b);
 215         ZADD(v, 4, b, 0, b, 1);
 216         ZADD(v, 5, b, 2, b, 3);
 217         ZADD(v, 6, b, 0, b, 2);
 218         ZADD(v, 7, b, 1, b, 3);
 219         ZADD(v, 8, v, 6, v, 7);
 220
 221         /*
 222          * Do the eight first 8x8 muls. Source words are at most 16382
 223          * each, so we can add product results together "as is" in 32-bit
 224          * words.
 225          */
 226         for (i = 0; i < 40; i += 5) {
 227                 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
 228                 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
 229                         + MUL15(u[i + 1], v[i + 0]);
 230                 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
 231                         + MUL15(u[i + 1], v[i + 1])
 232                         + MUL15(u[i + 2], v[i + 0]);
 233                 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
 234                         + MUL15(u[i + 1], v[i + 2])
 235                         + MUL15(u[i + 2], v[i + 1])
 236                         + MUL15(u[i + 3], v[i + 0]);
 237                 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
 238                         + MUL15(u[i + 1], v[i + 3])
 239                         + MUL15(u[i + 2], v[i + 2])
 240                         + MUL15(u[i + 3], v[i + 1])
 241                         + MUL15(u[i + 4], v[i + 0]);
 242                 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
 243                         + MUL15(u[i + 2], v[i + 3])
 244                         + MUL15(u[i + 3], v[i + 2])
 245                         + MUL15(u[i + 4], v[i + 1]);
 246                 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
 247                         + MUL15(u[i + 3], v[i + 3])
 248                         + MUL15(u[i + 4], v[i + 2]);
 249                 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
 250                         + MUL15(u[i + 4], v[i + 3]);
 251                 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
 252                 w[(i << 1) + 9] = 0;
 253         }
 254
 255         /*
 256          * For the 9th multiplication, source words are up to 32764,
 257          * so we must do some carry propagation. If we add up to
 258          * 4 products and the carry is no more than 524224, then the
 259          * result fits in 32 bits, and the next carry will be no more
 260          * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
 261          *
 262          * We thus just skip one of the products in the middle word,
 263          * then do a carry propagation (this reduces words to 13 bits
 264          * each, except possibly the last, which may use up to 17 bits
 265          * or so), then add the missing product.
 266          */
 267         w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
 268         w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
 269                 + MUL15(u[40 + 1], v[40 + 0]);
 270         w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
 271                 + MUL15(u[40 + 1], v[40 + 1])
 272                 + MUL15(u[40 + 2], v[40 + 0]);
 273         w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
 274                 + MUL15(u[40 + 1], v[40 + 2])
 275                 + MUL15(u[40 + 2], v[40 + 1])
 276                 + MUL15(u[40 + 3], v[40 + 0]);
 277         w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
 278                 + MUL15(u[40 + 1], v[40 + 3])
 279                 + MUL15(u[40 + 2], v[40 + 2])
 280                 + MUL15(u[40 + 3], v[40 + 1]);
 281                 /* + MUL15(u[40 + 4], v[40 + 0]) */
 282         w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
 283                 + MUL15(u[40 + 2], v[40 + 3])
 284                 + MUL15(u[40 + 3], v[40 + 2])
 285                 + MUL15(u[40 + 4], v[40 + 1]);
 286         w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
 287                 + MUL15(u[40 + 3], v[40 + 3])
 288                 + MUL15(u[40 + 4], v[40 + 2]);
 289         w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
 290                 + MUL15(u[40 + 4], v[40 + 3]);
 291         w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
 292
 293         CPR(w, 80);
 294
 295         w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
 296
 297         /*
 298          * The products on 14-bit words in slots 6 and 7 yield values
 299          * up to 5*(16382^2) each, and we need to subtract two such
 300          * values from the higher word. We need the subtraction to fit
 301          * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
 302          * However, 10*(16382^2) does not fit. So we must perform a
 303          * bit of reduction here.
 304          */
 305         CPR(w, 60);
 306         CPR(w, 70);
 307
 308         /*
 309          * Recompose results.
 310          */
 311
 312         /* 0..1*0..1 into 0..3 */
 313         ZSUB2F(w, 8, w, 0, w, 2);
 314         ZSUB2F(w, 9, w, 1, w, 3);
 315         ZADDT(w, 1, w, 8);
 316         ZADDT(w, 2, w, 9);
 317
 318         /* 2..3*2..3 into 4..7 */
 319         ZSUB2F(w, 10, w, 4, w, 6);
 320         ZSUB2F(w, 11, w, 5, w, 7);
 321         ZADDT(w, 5, w, 10);
 322         ZADDT(w, 6, w, 11);
 323
 324         /* (0..1+2..3)*(0..1+2..3) into 12..15 */
 325         ZSUB2F(w, 16, w, 12, w, 14);
 326         ZSUB2F(w, 17, w, 13, w, 15);
 327         ZADDT(w, 13, w, 16);
 328         ZADDT(w, 14, w, 17);
 329
 330         /* first-level recomposition */
 331         ZSUB2F(w, 12, w, 0, w, 4);
 332         ZSUB2F(w, 13, w, 1, w, 5);
 333         ZSUB2F(w, 14, w, 2, w, 6);
 334         ZSUB2F(w, 15, w, 3, w, 7);
 335         ZADDT(w, 2, w, 12);
 336         ZADDT(w, 3, w, 13);
 337         ZADDT(w, 4, w, 14);
 338         ZADDT(w, 5, w, 15);
 339
 340         /*
 341          * Perform carry propagation to bring all words down to 13 bits.
 342          */
 343         cc = norm13(d, w, 40);
 344         d[39] += (cc << 13);
 345
 346 #undef ZADD
 347 #undef ZADDT
 348 #undef ZSUB2F
 349 #undef CPR1
 350 #undef CPR
 351 }
 352
 353 static inline void
 354 square20(uint32_t *d, const uint32_t *a)
 355 {
 356         mul20(d, a, a);
 357 }
 358
 359 #else
 360
 361 static void
 362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 363 {
 364         uint32_t t[39];
 365
 366         t[ 0] = MUL15(a[ 0], b[ 0]);
 367         t[ 1] = MUL15(a[ 0], b[ 1])
 368                 + MUL15(a[ 1], b[ 0]);
 369         t[ 2] = MUL15(a[ 0], b[ 2])
 370                 + MUL15(a[ 1], b[ 1])
 371                 + MUL15(a[ 2], b[ 0]);
 372         t[ 3] = MUL15(a[ 0], b[ 3])
 373                 + MUL15(a[ 1], b[ 2])
 374                 + MUL15(a[ 2], b[ 1])
 375                 + MUL15(a[ 3], b[ 0]);
 376         t[ 4] = MUL15(a[ 0], b[ 4])
 377                 + MUL15(a[ 1], b[ 3])
 378                 + MUL15(a[ 2], b[ 2])
 379                 + MUL15(a[ 3], b[ 1])
 380                 + MUL15(a[ 4], b[ 0]);
 381         t[ 5] = MUL15(a[ 0], b[ 5])
 382                 + MUL15(a[ 1], b[ 4])
 383                 + MUL15(a[ 2], b[ 3])
 384                 + MUL15(a[ 3], b[ 2])
 385                 + MUL15(a[ 4], b[ 1])
 386                 + MUL15(a[ 5], b[ 0]);
 387         t[ 6] = MUL15(a[ 0], b[ 6])
 388                 + MUL15(a[ 1], b[ 5])
 389                 + MUL15(a[ 2], b[ 4])
 390                 + MUL15(a[ 3], b[ 3])
 391                 + MUL15(a[ 4], b[ 2])
 392                 + MUL15(a[ 5], b[ 1])
 393                 + MUL15(a[ 6], b[ 0]);
 394         t[ 7] = MUL15(a[ 0], b[ 7])
 395                 + MUL15(a[ 1], b[ 6])
 396                 + MUL15(a[ 2], b[ 5])
 397                 + MUL15(a[ 3], b[ 4])
 398                 + MUL15(a[ 4], b[ 3])
 399                 + MUL15(a[ 5], b[ 2])
 400                 + MUL15(a[ 6], b[ 1])
 401                 + MUL15(a[ 7], b[ 0]);
 402         t[ 8] = MUL15(a[ 0], b[ 8])
 403                 + MUL15(a[ 1], b[ 7])
 404                 + MUL15(a[ 2], b[ 6])
 405                 + MUL15(a[ 3], b[ 5])
 406                 + MUL15(a[ 4], b[ 4])
 407                 + MUL15(a[ 5], b[ 3])
 408                 + MUL15(a[ 6], b[ 2])
 409                 + MUL15(a[ 7], b[ 1])
 410                 + MUL15(a[ 8], b[ 0]);
 411         t[ 9] = MUL15(a[ 0], b[ 9])
 412                 + MUL15(a[ 1], b[ 8])
 413                 + MUL15(a[ 2], b[ 7])
 414                 + MUL15(a[ 3], b[ 6])
 415                 + MUL15(a[ 4], b[ 5])
 416                 + MUL15(a[ 5], b[ 4])
 417                 + MUL15(a[ 6], b[ 3])
 418                 + MUL15(a[ 7], b[ 2])
 419                 + MUL15(a[ 8], b[ 1])
 420                 + MUL15(a[ 9], b[ 0]);
 421         t[10] = MUL15(a[ 0], b[10])
 422                 + MUL15(a[ 1], b[ 9])
 423                 + MUL15(a[ 2], b[ 8])
 424                 + MUL15(a[ 3], b[ 7])
 425                 + MUL15(a[ 4], b[ 6])
 426                 + MUL15(a[ 5], b[ 5])
 427                 + MUL15(a[ 6], b[ 4])
 428                 + MUL15(a[ 7], b[ 3])
 429                 + MUL15(a[ 8], b[ 2])
 430                 + MUL15(a[ 9], b[ 1])
 431                 + MUL15(a[10], b[ 0]);
 432         t[11] = MUL15(a[ 0], b[11])
 433                 + MUL15(a[ 1], b[10])
 434                 + MUL15(a[ 2], b[ 9])
 435                 + MUL15(a[ 3], b[ 8])
 436                 + MUL15(a[ 4], b[ 7])
 437                 + MUL15(a[ 5], b[ 6])
 438                 + MUL15(a[ 6], b[ 5])
 439                 + MUL15(a[ 7], b[ 4])
 440                 + MUL15(a[ 8], b[ 3])
 441                 + MUL15(a[ 9], b[ 2])
 442                 + MUL15(a[10], b[ 1])
 443                 + MUL15(a[11], b[ 0]);
 444         t[12] = MUL15(a[ 0], b[12])
 445                 + MUL15(a[ 1], b[11])
 446                 + MUL15(a[ 2], b[10])
 447                 + MUL15(a[ 3], b[ 9])
 448                 + MUL15(a[ 4], b[ 8])
 449                 + MUL15(a[ 5], b[ 7])
 450                 + MUL15(a[ 6], b[ 6])
 451                 + MUL15(a[ 7], b[ 5])
 452                 + MUL15(a[ 8], b[ 4])
 453                 + MUL15(a[ 9], b[ 3])
 454                 + MUL15(a[10], b[ 2])
 455                 + MUL15(a[11], b[ 1])
 456                 + MUL15(a[12], b[ 0]);
 457         t[13] = MUL15(a[ 0], b[13])
 458                 + MUL15(a[ 1], b[12])
 459                 + MUL15(a[ 2], b[11])
 460                 + MUL15(a[ 3], b[10])
 461                 + MUL15(a[ 4], b[ 9])
 462                 + MUL15(a[ 5], b[ 8])
 463                 + MUL15(a[ 6], b[ 7])
 464                 + MUL15(a[ 7], b[ 6])
 465                 + MUL15(a[ 8], b[ 5])
 466                 + MUL15(a[ 9], b[ 4])
 467                 + MUL15(a[10], b[ 3])
 468                 + MUL15(a[11], b[ 2])
 469                 + MUL15(a[12], b[ 1])
 470                 + MUL15(a[13], b[ 0]);
 471         t[14] = MUL15(a[ 0], b[14])
 472                 + MUL15(a[ 1], b[13])
 473                 + MUL15(a[ 2], b[12])
 474                 + MUL15(a[ 3], b[11])
 475                 + MUL15(a[ 4], b[10])
 476                 + MUL15(a[ 5], b[ 9])
 477                 + MUL15(a[ 6], b[ 8])
 478                 + MUL15(a[ 7], b[ 7])
 479                 + MUL15(a[ 8], b[ 6])
 480                 + MUL15(a[ 9], b[ 5])
 481                 + MUL15(a[10], b[ 4])
 482                 + MUL15(a[11], b[ 3])
 483                 + MUL15(a[12], b[ 2])
 484                 + MUL15(a[13], b[ 1])
 485                 + MUL15(a[14], b[ 0]);
 486         t[15] = MUL15(a[ 0], b[15])
 487                 + MUL15(a[ 1], b[14])
 488                 + MUL15(a[ 2], b[13])
 489                 + MUL15(a[ 3], b[12])
 490                 + MUL15(a[ 4], b[11])
 491                 + MUL15(a[ 5], b[10])
 492                 + MUL15(a[ 6], b[ 9])
 493                 + MUL15(a[ 7], b[ 8])
 494                 + MUL15(a[ 8], b[ 7])
 495                 + MUL15(a[ 9], b[ 6])
 496                 + MUL15(a[10], b[ 5])
 497                 + MUL15(a[11], b[ 4])
 498                 + MUL15(a[12], b[ 3])
 499                 + MUL15(a[13], b[ 2])
 500                 + MUL15(a[14], b[ 1])
 501                 + MUL15(a[15], b[ 0]);
 502         t[16] = MUL15(a[ 0], b[16])
 503                 + MUL15(a[ 1], b[15])
 504                 + MUL15(a[ 2], b[14])
 505                 + MUL15(a[ 3], b[13])
 506                 + MUL15(a[ 4], b[12])
 507                 + MUL15(a[ 5], b[11])
 508                 + MUL15(a[ 6], b[10])
 509                 + MUL15(a[ 7], b[ 9])
 510                 + MUL15(a[ 8], b[ 8])
 511                 + MUL15(a[ 9], b[ 7])
 512                 + MUL15(a[10], b[ 6])
 513                 + MUL15(a[11], b[ 5])
 514                 + MUL15(a[12], b[ 4])
 515                 + MUL15(a[13], b[ 3])
 516                 + MUL15(a[14], b[ 2])
 517                 + MUL15(a[15], b[ 1])
 518                 + MUL15(a[16], b[ 0]);
 519         t[17] = MUL15(a[ 0], b[17])
 520                 + MUL15(a[ 1], b[16])
 521                 + MUL15(a[ 2], b[15])
 522                 + MUL15(a[ 3], b[14])
 523                 + MUL15(a[ 4], b[13])
 524                 + MUL15(a[ 5], b[12])
 525                 + MUL15(a[ 6], b[11])
 526                 + MUL15(a[ 7], b[10])
 527                 + MUL15(a[ 8], b[ 9])
 528                 + MUL15(a[ 9], b[ 8])
 529                 + MUL15(a[10], b[ 7])
 530                 + MUL15(a[11], b[ 6])
 531                 + MUL15(a[12], b[ 5])
 532                 + MUL15(a[13], b[ 4])
 533                 + MUL15(a[14], b[ 3])
 534                 + MUL15(a[15], b[ 2])
 535                 + MUL15(a[16], b[ 1])
 536                 + MUL15(a[17], b[ 0]);
 537         t[18] = MUL15(a[ 0], b[18])
 538                 + MUL15(a[ 1], b[17])
 539                 + MUL15(a[ 2], b[16])
 540                 + MUL15(a[ 3], b[15])
 541                 + MUL15(a[ 4], b[14])
 542                 + MUL15(a[ 5], b[13])
 543                 + MUL15(a[ 6], b[12])
 544                 + MUL15(a[ 7], b[11])
 545                 + MUL15(a[ 8], b[10])
 546                 + MUL15(a[ 9], b[ 9])
 547                 + MUL15(a[10], b[ 8])
 548                 + MUL15(a[11], b[ 7])
 549                 + MUL15(a[12], b[ 6])
 550                 + MUL15(a[13], b[ 5])
 551                 + MUL15(a[14], b[ 4])
 552                 + MUL15(a[15], b[ 3])
 553                 + MUL15(a[16], b[ 2])
 554                 + MUL15(a[17], b[ 1])
 555                 + MUL15(a[18], b[ 0]);
 556         t[19] = MUL15(a[ 0], b[19])
 557                 + MUL15(a[ 1], b[18])
 558                 + MUL15(a[ 2], b[17])
 559                 + MUL15(a[ 3], b[16])
 560                 + MUL15(a[ 4], b[15])
 561                 + MUL15(a[ 5], b[14])
 562                 + MUL15(a[ 6], b[13])
 563                 + MUL15(a[ 7], b[12])
 564                 + MUL15(a[ 8], b[11])
 565                 + MUL15(a[ 9], b[10])
 566                 + MUL15(a[10], b[ 9])
 567                 + MUL15(a[11], b[ 8])
 568                 + MUL15(a[12], b[ 7])
 569                 + MUL15(a[13], b[ 6])
 570                 + MUL15(a[14], b[ 5])
 571                 + MUL15(a[15], b[ 4])
 572                 + MUL15(a[16], b[ 3])
 573                 + MUL15(a[17], b[ 2])
 574                 + MUL15(a[18], b[ 1])
 575                 + MUL15(a[19], b[ 0]);
 576         t[20] = MUL15(a[ 1], b[19])
 577                 + MUL15(a[ 2], b[18])
 578                 + MUL15(a[ 3], b[17])
 579                 + MUL15(a[ 4], b[16])
 580                 + MUL15(a[ 5], b[15])
 581                 + MUL15(a[ 6], b[14])
 582                 + MUL15(a[ 7], b[13])
 583                 + MUL15(a[ 8], b[12])
 584                 + MUL15(a[ 9], b[11])
 585                 + MUL15(a[10], b[10])
 586                 + MUL15(a[11], b[ 9])
 587                 + MUL15(a[12], b[ 8])
 588                 + MUL15(a[13], b[ 7])
 589                 + MUL15(a[14], b[ 6])
 590                 + MUL15(a[15], b[ 5])
 591                 + MUL15(a[16], b[ 4])
 592                 + MUL15(a[17], b[ 3])
 593                 + MUL15(a[18], b[ 2])
 594                 + MUL15(a[19], b[ 1]);
 595         t[21] = MUL15(a[ 2], b[19])
 596                 + MUL15(a[ 3], b[18])
 597                 + MUL15(a[ 4], b[17])
 598                 + MUL15(a[ 5], b[16])
 599                 + MUL15(a[ 6], b[15])
 600                 + MUL15(a[ 7], b[14])
 601                 + MUL15(a[ 8], b[13])
 602                 + MUL15(a[ 9], b[12])
 603                 + MUL15(a[10], b[11])
 604                 + MUL15(a[11], b[10])
 605                 + MUL15(a[12], b[ 9])
 606                 + MUL15(a[13], b[ 8])
 607                 + MUL15(a[14], b[ 7])
 608                 + MUL15(a[15], b[ 6])
 609                 + MUL15(a[16], b[ 5])
 610                 + MUL15(a[17], b[ 4])
 611                 + MUL15(a[18], b[ 3])
 612                 + MUL15(a[19], b[ 2]);
 613         t[22] = MUL15(a[ 3], b[19])
 614                 + MUL15(a[ 4], b[18])
 615                 + MUL15(a[ 5], b[17])
 616                 + MUL15(a[ 6], b[16])
 617                 + MUL15(a[ 7], b[15])
 618                 + MUL15(a[ 8], b[14])
 619                 + MUL15(a[ 9], b[13])
 620                 + MUL15(a[10], b[12])
 621                 + MUL15(a[11], b[11])
 622                 + MUL15(a[12], b[10])
 623                 + MUL15(a[13], b[ 9])
 624                 + MUL15(a[14], b[ 8])
 625                 + MUL15(a[15], b[ 7])
 626                 + MUL15(a[16], b[ 6])
 627                 + MUL15(a[17], b[ 5])
 628                 + MUL15(a[18], b[ 4])
 629                 + MUL15(a[19], b[ 3]);
 630         t[23] = MUL15(a[ 4], b[19])
 631                 + MUL15(a[ 5], b[18])
 632                 + MUL15(a[ 6], b[17])
 633                 + MUL15(a[ 7], b[16])
 634                 + MUL15(a[ 8], b[15])
 635                 + MUL15(a[ 9], b[14])
 636                 + MUL15(a[10], b[13])
 637                 + MUL15(a[11], b[12])
 638                 + MUL15(a[12], b[11])
 639                 + MUL15(a[13], b[10])
 640                 + MUL15(a[14], b[ 9])
 641                 + MUL15(a[15], b[ 8])
 642                 + MUL15(a[16], b[ 7])
 643                 + MUL15(a[17], b[ 6])
 644                 + MUL15(a[18], b[ 5])
 645                 + MUL15(a[19], b[ 4]);
 646         t[24] = MUL15(a[ 5], b[19])
 647                 + MUL15(a[ 6], b[18])
 648                 + MUL15(a[ 7], b[17])
 649                 + MUL15(a[ 8], b[16])
 650                 + MUL15(a[ 9], b[15])
 651                 + MUL15(a[10], b[14])
 652                 + MUL15(a[11], b[13])
 653                 + MUL15(a[12], b[12])
 654                 + MUL15(a[13], b[11])
 655                 + MUL15(a[14], b[10])
 656                 + MUL15(a[15], b[ 9])
 657                 + MUL15(a[16], b[ 8])
 658                 + MUL15(a[17], b[ 7])
 659                 + MUL15(a[18], b[ 6])
 660                 + MUL15(a[19], b[ 5]);
 661         t[25] = MUL15(a[ 6], b[19])
 662                 + MUL15(a[ 7], b[18])
 663                 + MUL15(a[ 8], b[17])
 664                 + MUL15(a[ 9], b[16])
 665                 + MUL15(a[10], b[15])
 666                 + MUL15(a[11], b[14])
 667                 + MUL15(a[12], b[13])
 668                 + MUL15(a[13], b[12])
 669                 + MUL15(a[14], b[11])
 670                 + MUL15(a[15], b[10])
 671                 + MUL15(a[16], b[ 9])
 672                 + MUL15(a[17], b[ 8])
 673                 + MUL15(a[18], b[ 7])
 674                 + MUL15(a[19], b[ 6]);
 675         t[26] = MUL15(a[ 7], b[19])
 676                 + MUL15(a[ 8], b[18])
 677                 + MUL15(a[ 9], b[17])
 678                 + MUL15(a[10], b[16])
 679                 + MUL15(a[11], b[15])
 680                 + MUL15(a[12], b[14])
 681                 + MUL15(a[13], b[13])
 682                 + MUL15(a[14], b[12])
 683                 + MUL15(a[15], b[11])
 684                 + MUL15(a[16], b[10])
 685                 + MUL15(a[17], b[ 9])
 686                 + MUL15(a[18], b[ 8])
 687                 + MUL15(a[19], b[ 7]);
 688         t[27] = MUL15(a[ 8], b[19])
 689                 + MUL15(a[ 9], b[18])
 690                 + MUL15(a[10], b[17])
 691                 + MUL15(a[11], b[16])
 692                 + MUL15(a[12], b[15])
 693                 + MUL15(a[13], b[14])
 694                 + MUL15(a[14], b[13])
 695                 + MUL15(a[15], b[12])
 696                 + MUL15(a[16], b[11])
 697                 + MUL15(a[17], b[10])
 698                 + MUL15(a[18], b[ 9])
 699                 + MUL15(a[19], b[ 8]);
 700         t[28] = MUL15(a[ 9], b[19])
 701                 + MUL15(a[10], b[18])
 702                 + MUL15(a[11], b[17])
 703                 + MUL15(a[12], b[16])
 704                 + MUL15(a[13], b[15])
 705                 + MUL15(a[14], b[14])
 706                 + MUL15(a[15], b[13])
 707                 + MUL15(a[16], b[12])
 708                 + MUL15(a[17], b[11])
 709                 + MUL15(a[18], b[10])
 710                 + MUL15(a[19], b[ 9]);
 711         t[29] = MUL15(a[10], b[19])
 712                 + MUL15(a[11], b[18])
 713                 + MUL15(a[12], b[17])
 714                 + MUL15(a[13], b[16])
 715                 + MUL15(a[14], b[15])
 716                 + MUL15(a[15], b[14])
 717                 + MUL15(a[16], b[13])
 718                 + MUL15(a[17], b[12])
 719                 + MUL15(a[18], b[11])
 720                 + MUL15(a[19], b[10]);
 721         t[30] = MUL15(a[11], b[19])
 722                 + MUL15(a[12], b[18])
 723                 + MUL15(a[13], b[17])
 724                 + MUL15(a[14], b[16])
 725                 + MUL15(a[15], b[15])
 726                 + MUL15(a[16], b[14])
 727                 + MUL15(a[17], b[13])
 728                 + MUL15(a[18], b[12])
 729                 + MUL15(a[19], b[11]);
 730         t[31] = MUL15(a[12], b[19])
 731                 + MUL15(a[13], b[18])
 732                 + MUL15(a[14], b[17])
 733                 + MUL15(a[15], b[16])
 734                 + MUL15(a[16], b[15])
 735                 + MUL15(a[17], b[14])
 736                 + MUL15(a[18], b[13])
 737                 + MUL15(a[19], b[12]);
 738         t[32] = MUL15(a[13], b[19])
 739                 + MUL15(a[14], b[18])
 740                 + MUL15(a[15], b[17])
 741                 + MUL15(a[16], b[16])
 742                 + MUL15(a[17], b[15])
 743                 + MUL15(a[18], b[14])
 744                 + MUL15(a[19], b[13]);
 745         t[33] = MUL15(a[14], b[19])
 746                 + MUL15(a[15], b[18])
 747                 + MUL15(a[16], b[17])
 748                 + MUL15(a[17], b[16])
 749                 + MUL15(a[18], b[15])
 750                 + MUL15(a[19], b[14]);
 751         t[34] = MUL15(a[15], b[19])
 752                 + MUL15(a[16], b[18])
 753                 + MUL15(a[17], b[17])
 754                 + MUL15(a[18], b[16])
 755                 + MUL15(a[19], b[15]);
 756         t[35] = MUL15(a[16], b[19])
 757                 + MUL15(a[17], b[18])
 758                 + MUL15(a[18], b[17])
 759                 + MUL15(a[19], b[16]);
 760         t[36] = MUL15(a[17], b[19])
 761                 + MUL15(a[18], b[18])
 762                 + MUL15(a[19], b[17]);
 763         t[37] = MUL15(a[18], b[19])
 764                 + MUL15(a[19], b[18]);
 765         t[38] = MUL15(a[19], b[19]);
 766         d[39] = norm13(d, t, 39);
 767 }
 768
 769 static void
 770 square20(uint32_t *d, const uint32_t *a)
 771 {
 772         uint32_t t[39];
 773
 774         t[ 0] = MUL15(a[ 0], a[ 0]);
 775         t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
 776         t[ 2] = MUL15(a[ 1], a[ 1])
 777                 + ((MUL15(a[ 0], a[ 2])) << 1);
 778         t[ 3] = ((MUL15(a[ 0], a[ 3])
 779                 + MUL15(a[ 1], a[ 2])) << 1);
 780         t[ 4] = MUL15(a[ 2], a[ 2])
 781                 + ((MUL15(a[ 0], a[ 4])
 782                 + MUL15(a[ 1], a[ 3])) << 1);
 783         t[ 5] = ((MUL15(a[ 0], a[ 5])
 784                 + MUL15(a[ 1], a[ 4])
 785                 + MUL15(a[ 2], a[ 3])) << 1);
 786         t[ 6] = MUL15(a[ 3], a[ 3])
 787                 + ((MUL15(a[ 0], a[ 6])
 788                 + MUL15(a[ 1], a[ 5])
 789                 + MUL15(a[ 2], a[ 4])) << 1);
 790         t[ 7] = ((MUL15(a[ 0], a[ 7])
 791                 + MUL15(a[ 1], a[ 6])
 792                 + MUL15(a[ 2], a[ 5])
 793                 + MUL15(a[ 3], a[ 4])) << 1);
 794         t[ 8] = MUL15(a[ 4], a[ 4])
 795                 + ((MUL15(a[ 0], a[ 8])
 796                 + MUL15(a[ 1], a[ 7])
 797                 + MUL15(a[ 2], a[ 6])
 798                 + MUL15(a[ 3], a[ 5])) << 1);
 799         t[ 9] = ((MUL15(a[ 0], a[ 9])
 800                 + MUL15(a[ 1], a[ 8])
 801                 + MUL15(a[ 2], a[ 7])
 802                 + MUL15(a[ 3], a[ 6])
 803                 + MUL15(a[ 4], a[ 5])) << 1);
 804         t[10] = MUL15(a[ 5], a[ 5])
 805                 + ((MUL15(a[ 0], a[10])
 806                 + MUL15(a[ 1], a[ 9])
 807                 + MUL15(a[ 2], a[ 8])
 808                 + MUL15(a[ 3], a[ 7])
 809                 + MUL15(a[ 4], a[ 6])) << 1);
 810         t[11] = ((MUL15(a[ 0], a[11])
 811                 + MUL15(a[ 1], a[10])
 812                 + MUL15(a[ 2], a[ 9])
 813                 + MUL15(a[ 3], a[ 8])
 814                 + MUL15(a[ 4], a[ 7])
 815                 + MUL15(a[ 5], a[ 6])) << 1);
 816         t[12] = MUL15(a[ 6], a[ 6])
 817                 + ((MUL15(a[ 0], a[12])
 818                 + MUL15(a[ 1], a[11])
 819                 + MUL15(a[ 2], a[10])
 820                 + MUL15(a[ 3], a[ 9])
 821                 + MUL15(a[ 4], a[ 8])
 822                 + MUL15(a[ 5], a[ 7])) << 1);
 823         t[13] = ((MUL15(a[ 0], a[13])
 824                 + MUL15(a[ 1], a[12])
 825                 + MUL15(a[ 2], a[11])
 826                 + MUL15(a[ 3], a[10])
 827                 + MUL15(a[ 4], a[ 9])
 828                 + MUL15(a[ 5], a[ 8])
 829                 + MUL15(a[ 6], a[ 7])) << 1);
 830         t[14] = MUL15(a[ 7], a[ 7])
 831                 + ((MUL15(a[ 0], a[14])
 832                 + MUL15(a[ 1], a[13])
 833                 + MUL15(a[ 2], a[12])
 834                 + MUL15(a[ 3], a[11])
 835                 + MUL15(a[ 4], a[10])
 836                 + MUL15(a[ 5], a[ 9])
 837                 + MUL15(a[ 6], a[ 8])) << 1);
 838         t[15] = ((MUL15(a[ 0], a[15])
 839                 + MUL15(a[ 1], a[14])
 840                 + MUL15(a[ 2], a[13])
 841                 + MUL15(a[ 3], a[12])
 842                 + MUL15(a[ 4], a[11])
 843                 + MUL15(a[ 5], a[10])
 844                 + MUL15(a[ 6], a[ 9])
 845                 + MUL15(a[ 7], a[ 8])) << 1);
 846         t[16] = MUL15(a[ 8], a[ 8])
 847                 + ((MUL15(a[ 0], a[16])
 848                 + MUL15(a[ 1], a[15])
 849                 + MUL15(a[ 2], a[14])
 850                 + MUL15(a[ 3], a[13])
 851                 + MUL15(a[ 4], a[12])
 852                 + MUL15(a[ 5], a[11])
 853                 + MUL15(a[ 6], a[10])
 854                 + MUL15(a[ 7], a[ 9])) << 1);
 855         t[17] = ((MUL15(a[ 0], a[17])
 856                 + MUL15(a[ 1], a[16])
 857                 + MUL15(a[ 2], a[15])
 858                 + MUL15(a[ 3], a[14])
 859                 + MUL15(a[ 4], a[13])
 860                 + MUL15(a[ 5], a[12])
 861                 + MUL15(a[ 6], a[11])
 862                 + MUL15(a[ 7], a[10])
 863                 + MUL15(a[ 8], a[ 9])) << 1);
 864         t[18] = MUL15(a[ 9], a[ 9])
 865                 + ((MUL15(a[ 0], a[18])
 866                 + MUL15(a[ 1], a[17])
 867                 + MUL15(a[ 2], a[16])
 868                 + MUL15(a[ 3], a[15])
 869                 + MUL15(a[ 4], a[14])
 870                 + MUL15(a[ 5], a[13])
 871                 + MUL15(a[ 6], a[12])
 872                 + MUL15(a[ 7], a[11])
 873                 + MUL15(a[ 8], a[10])) << 1);
 874         t[19] = ((MUL15(a[ 0], a[19])
 875                 + MUL15(a[ 1], a[18])
 876                 + MUL15(a[ 2], a[17])
 877                 + MUL15(a[ 3], a[16])
 878                 + MUL15(a[ 4], a[15])
 879                 + MUL15(a[ 5], a[14])
 880                 + MUL15(a[ 6], a[13])
 881                 + MUL15(a[ 7], a[12])
 882                 + MUL15(a[ 8], a[11])
 883                 + MUL15(a[ 9], a[10])) << 1);
 884         t[20] = MUL15(a[10], a[10])
 885                 + ((MUL15(a[ 1], a[19])
 886                 + MUL15(a[ 2], a[18])
 887                 + MUL15(a[ 3], a[17])
 888                 + MUL15(a[ 4], a[16])
 889                 + MUL15(a[ 5], a[15])
 890                 + MUL15(a[ 6], a[14])
 891                 + MUL15(a[ 7], a[13])
 892                 + MUL15(a[ 8], a[12])
 893                 + MUL15(a[ 9], a[11])) << 1);
 894         t[21] = ((MUL15(a[ 2], a[19])
 895                 + MUL15(a[ 3], a[18])
 896                 + MUL15(a[ 4], a[17])
 897                 + MUL15(a[ 5], a[16])
 898                 + MUL15(a[ 6], a[15])
 899                 + MUL15(a[ 7], a[14])
 900                 + MUL15(a[ 8], a[13])
 901                 + MUL15(a[ 9], a[12])
 902                 + MUL15(a[10], a[11])) << 1);
 903         t[22] = MUL15(a[11], a[11])
 904                 + ((MUL15(a[ 3], a[19])
 905                 + MUL15(a[ 4], a[18])
 906                 + MUL15(a[ 5], a[17])
 907                 + MUL15(a[ 6], a[16])
 908                 + MUL15(a[ 7], a[15])
 909                 + MUL15(a[ 8], a[14])
 910                 + MUL15(a[ 9], a[13])
 911                 + MUL15(a[10], a[12])) << 1);
 912         t[23] = ((MUL15(a[ 4], a[19])
 913                 + MUL15(a[ 5], a[18])
 914                 + MUL15(a[ 6], a[17])
 915                 + MUL15(a[ 7], a[16])
 916                 + MUL15(a[ 8], a[15])
 917                 + MUL15(a[ 9], a[14])
 918                 + MUL15(a[10], a[13])
 919                 + MUL15(a[11], a[12])) << 1);
 920         t[24] = MUL15(a[12], a[12])
 921                 + ((MUL15(a[ 5], a[19])
 922                 + MUL15(a[ 6], a[18])
 923                 + MUL15(a[ 7], a[17])
 924                 + MUL15(a[ 8], a[16])
 925                 + MUL15(a[ 9], a[15])
 926                 + MUL15(a[10], a[14])
 927                 + MUL15(a[11], a[13])) << 1);
 928         t[25] = ((MUL15(a[ 6], a[19])
 929                 + MUL15(a[ 7], a[18])
 930                 + MUL15(a[ 8], a[17])
 931                 + MUL15(a[ 9], a[16])
 932                 + MUL15(a[10], a[15])
 933                 + MUL15(a[11], a[14])
 934                 + MUL15(a[12], a[13])) << 1);
 935         t[26] = MUL15(a[13], a[13])
 936                 + ((MUL15(a[ 7], a[19])
 937                 + MUL15(a[ 8], a[18])
 938                 + MUL15(a[ 9], a[17])
 939                 + MUL15(a[10], a[16])
 940                 + MUL15(a[11], a[15])
 941                 + MUL15(a[12], a[14])) << 1);
 942         t[27] = ((MUL15(a[ 8], a[19])
 943                 + MUL15(a[ 9], a[18])
 944                 + MUL15(a[10], a[17])
 945                 + MUL15(a[11], a[16])
 946                 + MUL15(a[12], a[15])
 947                 + MUL15(a[13], a[14])) << 1);
 948         t[28] = MUL15(a[14], a[14])
 949                 + ((MUL15(a[ 9], a[19])
 950                 + MUL15(a[10], a[18])
 951                 + MUL15(a[11], a[17])
 952                 + MUL15(a[12], a[16])
 953                 + MUL15(a[13], a[15])) << 1);
 954         t[29] = ((MUL15(a[10], a[19])
 955                 + MUL15(a[11], a[18])
 956                 + MUL15(a[12], a[17])
 957                 + MUL15(a[13], a[16])
 958                 + MUL15(a[14], a[15])) << 1);
 959         t[30] = MUL15(a[15], a[15])
 960                 + ((MUL15(a[11], a[19])
 961                 + MUL15(a[12], a[18])
 962                 + MUL15(a[13], a[17])
 963                 + MUL15(a[14], a[16])) << 1);
 964         t[31] = ((MUL15(a[12], a[19])
 965                 + MUL15(a[13], a[18])
 966                 + MUL15(a[14], a[17])
 967                 + MUL15(a[15], a[16])) << 1);
 968         t[32] = MUL15(a[16], a[16])
 969                 + ((MUL15(a[13], a[19])
 970                 + MUL15(a[14], a[18])
 971                 + MUL15(a[15], a[17])) << 1);
 972         t[33] = ((MUL15(a[14], a[19])
 973                 + MUL15(a[15], a[18])
 974                 + MUL15(a[16], a[17])) << 1);
 975         t[34] = MUL15(a[17], a[17])
 976                 + ((MUL15(a[15], a[19])
 977                 + MUL15(a[16], a[18])) << 1);
 978         t[35] = ((MUL15(a[16], a[19])
 979                 + MUL15(a[17], a[18])) << 1);
 980         t[36] = MUL15(a[18], a[18])
 981                 + ((MUL15(a[17], a[19])) << 1);
 982         t[37] = ((MUL15(a[18], a[19])) << 1);
 983         t[38] = MUL15(a[19], a[19]);
 984         d[39] = norm13(d, t, 39);
 985 }
 986
 987 #endif
 988
 989 /*
 990  * Modulus for field F256 (field for point coordinates in curve P-256).
 991  */
 992 static const uint32_t F256[] = {
 993         0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 994         0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 995         0x0000, 0x1FF8, 0x1FFF, 0x01FF
 996 };
 997
 998 /*
 999  * The 'b' curve equation coefficient for P-256.
1000  */
1001 static const uint32_t P256_B[] = {
1002         0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003         0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004         0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005 };
1006
1007 /*
1008  * Perform a "short reduction" in field F256 (field for curve P-256).
1009  * The source value should be less than 262 bits; on output, it will
1010  * be at most 257 bits, and less than twice the modulus.
1011  */
1012 static void
1013 reduce_f256(uint32_t *d)
1014 {
1015         uint32_t x;
1016
1017         x = d[19] >> 9;
1018         d[19] &= 0x01FF;
1019         d[17] += x << 3;
1020         d[14] -= x << 10;
1021         d[7] -= x << 5;
1022         d[0] += x;
1023         norm13(d, d, 20);
1024 }
1025
1026 /*
1027  * Perform a "final reduction" in field F256 (field for curve P-256).
1028  * The source value must be less than twice the modulus. If the value
1029  * is not lower than the modulus, then the modulus is subtracted and
1030  * this function returns 1; otherwise, it leaves it untouched and it
1031  * returns 0.
1032  */
1033 static uint32_t
1034 reduce_final_f256(uint32_t *d)
1035 {
1036         uint32_t t[20];
1037         uint32_t cc;
1038         int i;
1039
1040         memcpy(t, d, sizeof t);
1041         cc = 0;
1042         for (i = 0; i < 20; i ++) {
1043                 uint32_t w;
1044
1045                 w = t[i] - F256[i] - cc;
1046                 cc = w >> 31;
1047                 t[i] = w & 0x1FFF;
1048         }
1049         cc ^= 1;
1050         CCOPY(cc, d, t, sizeof t);
1051         return cc;
1052 }
1053
1054 /*
1055  * Perform a multiplication of two integers modulo
1056  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057  * of 20 words, each containing 13 bits of data, in little-endian order.
1058  * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059  * on output, value fits on 257 bits and is lower than twice the modulus.
1060  */
1061 static void
1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063 {
1064         uint32_t t[40], cc;
1065         int i;
1066
1067         /*
1068          * Compute raw multiplication. All result words fit in 13 bits
1069          * each.
1070          */
1071         mul20(t, a, b);
1072
1073         /*
1074          * Modular reduction: each high word in added/subtracted where
1075          * necessary.
1076          *
1077          * The modulus is:
1078          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079          * Therefore:
1080          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081          *
1082          * For a word x at bit offset n (n >= 256), we have:
1083          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084          *            - x*2^(n - 160) + x*2^(n-256) mod p
1085          *
1086          * Thus, we can nullify the high word if we reinject it at some
1087          * proper emplacements.
1088          */
1089         for (i = 39; i >= 20; i --) {
1090                 uint32_t x;
1091
1092                 x = t[i];
1093                 t[i - 2] += ARSH(x, 6);
1094                 t[i - 3] += (x << 7) & 0x1FFF;
1095                 t[i - 4] -= ARSH(x, 12);
1096                 t[i - 5] -= (x << 1) & 0x1FFF;
1097                 t[i - 12] -= ARSH(x, 4);
1098                 t[i - 13] -= (x << 9) & 0x1FFF;
1099                 t[i - 19] += ARSH(x, 9);
1100                 t[i - 20] += (x << 4) & 0x1FFF;
1101         }
1102
1103         /*
1104          * Propagate carries. This is a signed propagation, and the
1105          * result may be negative. The loop above may enlarge values,
1106          * but not two much: worst case is the chain involving t[i - 3],
1107          * in which a value may be added to itself up to 7 times. Since
1108          * starting values are 13-bit each, all words fit on 20 bits
1109          * (21 to account for the sign bit).
1110          */
1111         cc = norm13(t, t, 20);
1112
1113         /*
1114          * Perform modular reduction again for the bits beyond 256 (the carry
1115          * and the bits 256..259). Since the largest shift below is by 10
1116          * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117          * thereby allowing injecting full word values.
1118          */
1119         cc = (cc << 4) | (t[19] >> 9);
1120         t[19] &= 0x01FF;
1121         t[17] += cc << 3;
1122         t[14] -= cc << 10;
1123         t[7] -= cc << 5;
1124         t[0] += cc;
1125
1126         /*
1127          * If the carry is negative, then after carry propagation, we may
1128          * end up with a value which is negative, and we don't want that.
1129          * Thus, in that case, we add the modulus. Note that the subtraction
1130          * result, when the carry is negative, is always smaller than the
1131          * modulus, so the extra addition will not make the value exceed
1132          * twice the modulus.
1133          */
1134         cc >>= 31;
1135         t[0] -= cc;
1136         t[7] += cc << 5;
1137         t[14] += cc << 10;
1138         t[17] -= cc << 3;
1139         t[19] += cc << 9;
1140
1141         norm13(d, t, 20);
1142 }
1143
1144 /*
1145  * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146  * P-256). Operand is an array of 20 words, each containing 13 bits of
1147  * data, in little-endian order. On input, upper word may be up to 13
1148  * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1149  * and is lower than twice the modulus.
1150  */
1151 static void
1152 square_f256(uint32_t *d, const uint32_t *a)
1153 {
1154         uint32_t t[40], cc;
1155         int i;
1156
1157         /*
1158          * Compute raw square. All result words fit in 13 bits each.
1159          */
1160         square20(t, a);
1161
1162         /*
1163          * Modular reduction: each high word in added/subtracted where
1164          * necessary.
1165          *
1166          * The modulus is:
1167          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1168          * Therefore:
1169          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1170          *
1171          * For a word x at bit offset n (n >= 256), we have:
1172          *    x*2^n = x*2^(n-32) - x*2^(n-64)
1173          *            - x*2^(n - 160) + x*2^(n-256) mod p
1174          *
1175          * Thus, we can nullify the high word if we reinject it at some
1176          * proper emplacements.
1177          */
1178         for (i = 39; i >= 20; i --) {
1179                 uint32_t x;
1180
1181                 x = t[i];
1182                 t[i - 2] += ARSH(x, 6);
1183                 t[i - 3] += (x << 7) & 0x1FFF;
1184                 t[i - 4] -= ARSH(x, 12);
1185                 t[i - 5] -= (x << 1) & 0x1FFF;
1186                 t[i - 12] -= ARSH(x, 4);
1187                 t[i - 13] -= (x << 9) & 0x1FFF;
1188                 t[i - 19] += ARSH(x, 9);
1189                 t[i - 20] += (x << 4) & 0x1FFF;
1190         }
1191
1192         /*
1193          * Propagate carries. This is a signed propagation, and the
1194          * result may be negative. The loop above may enlarge values,
1195          * but not two much: worst case is the chain involving t[i - 3],
1196          * in which a value may be added to itself up to 7 times. Since
1197          * starting values are 13-bit each, all words fit on 20 bits
1198          * (21 to account for the sign bit).
1199          */
1200         cc = norm13(t, t, 20);
1201
1202         /*
1203          * Perform modular reduction again for the bits beyond 256 (the carry
1204          * and the bits 256..259). Since the largest shift below is by 10
1205          * bits, and the values fit on 21 bits, values fit in 32-bit words,
1206          * thereby allowing injecting full word values.
1207          */
1208         cc = (cc << 4) | (t[19] >> 9);
1209         t[19] &= 0x01FF;
1210         t[17] += cc << 3;
1211         t[14] -= cc << 10;
1212         t[7] -= cc << 5;
1213         t[0] += cc;
1214
1215         /*
1216          * If the carry is negative, then after carry propagation, we may
1217          * end up with a value which is negative, and we don't want that.
1218          * Thus, in that case, we add the modulus. Note that the subtraction
1219          * result, when the carry is negative, is always smaller than the
1220          * modulus, so the extra addition will not make the value exceed
1221          * twice the modulus.
1222          */
1223         cc >>= 31;
1224         t[0] -= cc;
1225         t[7] += cc << 5;
1226         t[14] += cc << 10;
1227         t[17] -= cc << 3;
1228         t[19] += cc << 9;
1229
1230         norm13(d, t, 20);
1231 }
1232
1233 /*
1234  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1235  * are such that:
1236  *   X = x / z^2
1237  *   Y = y / z^3
1238  * For the point at infinity, z = 0.
1239  * Each point thus admits many possible representations.
1240  *
1241  * Coordinates are represented in arrays of 32-bit integers, each holding
1242  * 13 bits of data. Values may also be slightly greater than the modulus,
1243  * but they will always be lower than twice the modulus.
1244  */
1245 typedef struct {
1246         uint32_t x[20];
1247         uint32_t y[20];
1248         uint32_t z[20];
1249 } p256_jacobian;
1250
1251 /*
1252  * Convert a point to affine coordinates:
1253  *  - If the point is the point at infinity, then all three coordinates
1254  *    are set to 0.
1255  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1256  *    coordinates are the 'X' and 'Y' affine coordinates.
1257  * The coordinates are guaranteed to be lower than the modulus.
1258  */
1259 static void
1260 p256_to_affine(p256_jacobian *P)
1261 {
1262         uint32_t t1[20], t2[20];
1263         int i;
1264
1265         /*
1266          * Invert z with a modular exponentiation: the modulus is
1267          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1268          * p-2. Exponent bit pattern (from high to low) is:
1269          *  - 32 bits of value 1
1270          *  - 31 bits of value 0
1271          *  - 1 bit of value 1
1272          *  - 96 bits of value 0
1273          *  - 94 bits of value 1
1274          *  - 1 bit of value 0
1275          *  - 1 bit of value 1
1276          * Thus, we precompute z^(2^31-1) to speed things up.
1277          *
1278          * If z = 0 (point at infinity) then the modular exponentiation
1279          * will yield 0, which leads to the expected result (all three
1280          * coordinates set to 0).
1281          */
1282
1283         /*
1284          * A simple square-and-multiply for z^(2^31-1). We could save about
1285          * two dozen multiplications here with an addition chain, but
1286          * this would require a bit more code, and extra stack buffers.
1287          */
1288         memcpy(t1, P->z, sizeof P->z);
1289         for (i = 0; i < 30; i ++) {
1290                 square_f256(t1, t1);
1291                 mul_f256(t1, t1, P->z);
1292         }
1293
1294         /*
1295          * Square-and-multiply. Apart from the squarings, we have a few
1296          * multiplications to set bits to 1; we multiply by the original z
1297          * for setting 1 bit, and by t1 for setting 31 bits.
1298          */
1299         memcpy(t2, P->z, sizeof P->z);
1300         for (i = 1; i < 256; i ++) {
1301                 square_f256(t2, t2);
1302                 switch (i) {
1303                 case 31:
1304                 case 190:
1305                 case 221:
1306                 case 252:
1307                         mul_f256(t2, t2, t1);
1308                         break;
1309                 case 63:
1310                 case 253:
1311                 case 255:
1312                         mul_f256(t2, t2, P->z);
1313                         break;
1314                 }
1315         }
1316
1317         /*
1318          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1319          */
1320         mul_f256(t1, t2, t2);
1321         mul_f256(P->x, t1, P->x);
1322         mul_f256(t1, t1, t2);
1323         mul_f256(P->y, t1, P->y);
1324         reduce_final_f256(P->x);
1325         reduce_final_f256(P->y);
1326
1327         /*
1328          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1329          * this will set z to 1.
1330          */
1331         mul_f256(P->z, P->z, t2);
1332         reduce_final_f256(P->z);
1333 }
1334
1335 /*
1336  * Double a point in P-256. This function works for all valid points,
1337  * including the point at infinity.
1338  */
1339 static void
1340 p256_double(p256_jacobian *Q)
1341 {
1342         /*
1343          * Doubling formulas are:
1344          *
1345          *   s = 4*x*y^2
1346          *   m = 3*(x + z^2)*(x - z^2)
1347          *   x' = m^2 - 2*s
1348          *   y' = m*(s - x') - 8*y^4
1349          *   z' = 2*y*z
1350          *
1351          * These formulas work for all points, including points of order 2
1352          * and points at infinity:
1353          *   - If y = 0 then z' = 0. But there is no such point in P-256
1354          *     anyway.
1355          *   - If z = 0 then z' = 0.
1356          */
1357         uint32_t t1[20], t2[20], t3[20], t4[20];
1358         int i;
1359
1360         /*
1361          * Compute z^2 in t1.
1362          */
1363         square_f256(t1, Q->z);
1364
1365         /*
1366          * Compute x-z^2 in t2 and x+z^2 in t1.
1367          */
1368         for (i = 0; i < 20; i ++) {
1369                 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1370                 t1[i] += Q->x[i];
1371         }
1372         norm13(t1, t1, 20);
1373         norm13(t2, t2, 20);
1374
1375         /*
1376          * Compute 3*(x+z^2)*(x-z^2) in t1.
1377          */
1378         mul_f256(t3, t1, t2);
1379         for (i = 0; i < 20; i ++) {
1380                 t1[i] = MUL15(3, t3[i]);
1381         }
1382         norm13(t1, t1, 20);
1383
1384         /*
1385          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1386          */
1387         square_f256(t3, Q->y);
1388         for (i = 0; i < 20; i ++) {
1389                 t3[i] <<= 1;
1390         }
1391         norm13(t3, t3, 20);
1392         mul_f256(t2, Q->x, t3);
1393         for (i = 0; i < 20; i ++) {
1394                 t2[i] <<= 1;
1395         }
1396         norm13(t2, t2, 20);
1397         reduce_f256(t2);
1398
1399         /*
1400          * Compute x' = m^2 - 2*s.
1401          */
1402         square_f256(Q->x, t1);
1403         for (i = 0; i < 20; i ++) {
1404                 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1405         }
1406         norm13(Q->x, Q->x, 20);
1407         reduce_f256(Q->x);
1408
1409         /*
1410          * Compute z' = 2*y*z.
1411          */
1412         mul_f256(t4, Q->y, Q->z);
1413         for (i = 0; i < 20; i ++) {
1414                 Q->z[i] = t4[i] << 1;
1415         }
1416         norm13(Q->z, Q->z, 20);
1417         reduce_f256(Q->z);
1418
1419         /*
1420          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1421          * 2*y^2 in t3.
1422          */
1423         for (i = 0; i < 20; i ++) {
1424                 t2[i] += (F256[i] << 1) - Q->x[i];
1425         }
1426         norm13(t2, t2, 20);
1427         mul_f256(Q->y, t1, t2);
1428         square_f256(t4, t3);
1429         for (i = 0; i < 20; i ++) {
1430                 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1431         }
1432         norm13(Q->y, Q->y, 20);
1433         reduce_f256(Q->y);
1434 }
1435
1436 /*
1437  * Add point P2 to point P1.
1438  *
1439  * This function computes the wrong result in the following cases:
1440  *
1441  *   - If P1 == 0 but P2 != 0
1442  *   - If P1 != 0 but P2 == 0
1443  *   - If P1 == P2
1444  *
1445  * In all three cases, P1 is set to the point at infinity.
1446  *
1447  * Returned value is 0 if one of the following occurs:
1448  *
1449  *   - P1 and P2 have the same Y coordinate
1450  *   - P1 == 0 and P2 == 0
1451  *   - The Y coordinate of one of the points is 0 and the other point is
1452  *     the point at infinity.
1453  *
1454  * The third case cannot actually happen with valid points, since a point
1455  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1456  * curve P-256.
1457  *
1458  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1459  * can apply the following:
1460  *
1461  *   - If the result is not the point at infinity, then it is correct.
1462  *   - Otherwise, if the returned value is 1, then this is a case of
1463  *     P1+P2 == 0, so the result is indeed the point at infinity.
1464  *   - Otherwise, P1 == P2, so a "double" operation should have been
1465  *     performed.
1466  */
1467 static uint32_t
1468 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1469 {
1470         /*
1471          * Addtions formulas are:
1472          *
1473          *   u1 = x1 * z2^2
1474          *   u2 = x2 * z1^2
1475          *   s1 = y1 * z2^3
1476          *   s2 = y2 * z1^3
1477          *   h = u2 - u1
1478          *   r = s2 - s1
1479          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1480          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1481          *   z3 = h * z1 * z2
1482          */
1483         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1484         uint32_t ret;
1485         int i;
1486
1487         /*
1488          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1489          */
1490         square_f256(t3, P2->z);
1491         mul_f256(t1, P1->x, t3);
1492         mul_f256(t4, P2->z, t3);
1493         mul_f256(t3, P1->y, t4);
1494
1495         /*
1496          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1497          */
1498         square_f256(t4, P1->z);
1499         mul_f256(t2, P2->x, t4);
1500         mul_f256(t5, P1->z, t4);
1501         mul_f256(t4, P2->y, t5);
1502
1503         /*
1504          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1505          * We need to test whether r is zero, so we will do some extra
1506          * reduce.
1507          */
1508         for (i = 0; i < 20; i ++) {
1509                 t2[i] += (F256[i] << 1) - t1[i];
1510                 t4[i] += (F256[i] << 1) - t3[i];
1511         }
1512         norm13(t2, t2, 20);
1513         norm13(t4, t4, 20);
1514         reduce_f256(t4);
1515         reduce_final_f256(t4);
1516         ret = 0;
1517         for (i = 0; i < 20; i ++) {
1518                 ret |= t4[i];
1519         }
1520         ret = (ret | -ret) >> 31;
1521
1522         /*
1523          * Compute u1*h^2 (in t6) and h^3 (in t5);
1524          */
1525         square_f256(t7, t2);
1526         mul_f256(t6, t1, t7);
1527         mul_f256(t5, t7, t2);
1528
1529         /*
1530          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1531          */
1532         square_f256(P1->x, t4);
1533         for (i = 0; i < 20; i ++) {
1534                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1535         }
1536         norm13(P1->x, P1->x, 20);
1537         reduce_f256(P1->x);
1538
1539         /*
1540          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1541          */
1542         for (i = 0; i < 20; i ++) {
1543                 t6[i] += (F256[i] << 1) - P1->x[i];
1544         }
1545         norm13(t6, t6, 20);
1546         mul_f256(P1->y, t4, t6);
1547         mul_f256(t1, t5, t3);
1548         for (i = 0; i < 20; i ++) {
1549                 P1->y[i] += (F256[i] << 1) - t1[i];
1550         }
1551         norm13(P1->y, P1->y, 20);
1552         reduce_f256(P1->y);
1553
1554         /*
1555          * Compute z3 = h*z1*z2.
1556          */
1557         mul_f256(t1, P1->z, P2->z);
1558         mul_f256(P1->z, t1, t2);
1559
1560         return ret;
1561 }
1562
1563 /*
1564  * Add point P2 to point P1. This is a specialised function for the
1565  * case when P2 is a non-zero point in affine coordinate.
1566  *
1567  * This function computes the wrong result in the following cases:
1568  *
1569  *   - If P1 == 0
1570  *   - If P1 == P2
1571  *
1572  * In both cases, P1 is set to the point at infinity.
1573  *
1574  * Returned value is 0 if one of the following occurs:
1575  *
1576  *   - P1 and P2 have the same Y coordinate
1577  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1578  *
1579  * The second case cannot actually happen with valid points, since a point
1580  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1581  * curve P-256.
1582  *
1583  * Therefore, assuming that P1 != 0 on input, then the caller
1584  * can apply the following:
1585  *
1586  *   - If the result is not the point at infinity, then it is correct.
1587  *   - Otherwise, if the returned value is 1, then this is a case of
1588  *     P1+P2 == 0, so the result is indeed the point at infinity.
1589  *   - Otherwise, P1 == P2, so a "double" operation should have been
1590  *     performed.
1591  */
1592 static uint32_t
1593 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1594 {
1595         /*
1596          * Addtions formulas are:
1597          *
1598          *   u1 = x1
1599          *   u2 = x2 * z1^2
1600          *   s1 = y1
1601          *   s2 = y2 * z1^3
1602          *   h = u2 - u1
1603          *   r = s2 - s1
1604          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1605          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1606          *   z3 = h * z1
1607          */
1608         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1609         uint32_t ret;
1610         int i;
1611
1612         /*
1613          * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1614          */
1615         memcpy(t1, P1->x, sizeof t1);
1616         memcpy(t3, P1->y, sizeof t3);
1617
1618         /*
1619          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1620          */
1621         square_f256(t4, P1->z);
1622         mul_f256(t2, P2->x, t4);
1623         mul_f256(t5, P1->z, t4);
1624         mul_f256(t4, P2->y, t5);
1625
1626         /*
1627          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1628          * We need to test whether r is zero, so we will do some extra
1629          * reduce.
1630          */
1631         for (i = 0; i < 20; i ++) {
1632                 t2[i] += (F256[i] << 1) - t1[i];
1633                 t4[i] += (F256[i] << 1) - t3[i];
1634         }
1635         norm13(t2, t2, 20);
1636         norm13(t4, t4, 20);
1637         reduce_f256(t4);
1638         reduce_final_f256(t4);
1639         ret = 0;
1640         for (i = 0; i < 20; i ++) {
1641                 ret |= t4[i];
1642         }
1643         ret = (ret | -ret) >> 31;
1644
1645         /*
1646          * Compute u1*h^2 (in t6) and h^3 (in t5);
1647          */
1648         square_f256(t7, t2);
1649         mul_f256(t6, t1, t7);
1650         mul_f256(t5, t7, t2);
1651
1652         /*
1653          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1654          */
1655         square_f256(P1->x, t4);
1656         for (i = 0; i < 20; i ++) {
1657                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1658         }
1659         norm13(P1->x, P1->x, 20);
1660         reduce_f256(P1->x);
1661
1662         /*
1663          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1664          */
1665         for (i = 0; i < 20; i ++) {
1666                 t6[i] += (F256[i] << 1) - P1->x[i];
1667         }
1668         norm13(t6, t6, 20);
1669         mul_f256(P1->y, t4, t6);
1670         mul_f256(t1, t5, t3);
1671         for (i = 0; i < 20; i ++) {
1672                 P1->y[i] += (F256[i] << 1) - t1[i];
1673         }
1674         norm13(P1->y, P1->y, 20);
1675         reduce_f256(P1->y);
1676
1677         /*
1678          * Compute z3 = h*z1*z2.
1679          */
1680         mul_f256(P1->z, P1->z, t2);
1681
1682         return ret;
1683 }
1684
1685 /*
1686  * Decode a P-256 point. This function does not support the point at
1687  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1688  */
1689 static uint32_t
1690 p256_decode(p256_jacobian *P, const void *src, size_t len)
1691 {
1692         const unsigned char *buf;
1693         uint32_t tx[20], ty[20], t1[20], t2[20];
1694         uint32_t bad;
1695         int i;
1696
1697         if (len != 65) {
1698                 return 0;
1699         }
1700         buf = src;
1701
1702         /*
1703          * First byte must be 0x04 (uncompressed format). We could support
1704          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1705          * least significant bit of the Y coordinate), but it is explicitly
1706          * forbidden by RFC 5480 (section 2.2).
1707          */
1708         bad = NEQ(buf[0], 0x04);
1709
1710         /*
1711          * Decode the coordinates, and check that they are both lower
1712          * than the modulus.
1713          */
1714         tx[19] = be8_to_le13(tx, buf + 1, 32);
1715         ty[19] = be8_to_le13(ty, buf + 33, 32);
1716         bad |= reduce_final_f256(tx);
1717         bad |= reduce_final_f256(ty);
1718
1719         /*
1720          * Check curve equation.
1721          */
1722         square_f256(t1, tx);
1723         mul_f256(t1, tx, t1);
1724         square_f256(t2, ty);
1725         for (i = 0; i < 20; i ++) {
1726                 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1727         }
1728         norm13(t1, t1, 20);
1729         reduce_f256(t1);
1730         reduce_final_f256(t1);
1731         for (i = 0; i < 20; i ++) {
1732                 bad |= t1[i];
1733         }
1734
1735         /*
1736          * Copy coordinates to the point structure.
1737          */
1738         memcpy(P->x, tx, sizeof tx);
1739         memcpy(P->y, ty, sizeof ty);
1740         memset(P->z, 0, sizeof P->z);
1741         P->z[0] = 1;
1742         return EQ(bad, 0);
1743 }
1744
1745 /*
1746  * Encode a point into a buffer. This function assumes that the point is
1747  * valid, in affine coordinates, and not the point at infinity.
1748  */
1749 static void
1750 p256_encode(void *dst, const p256_jacobian *P)
1751 {
1752         unsigned char *buf;
1753
1754         buf = dst;
1755         buf[0] = 0x04;
1756         le13_to_be8(buf + 1, 32, P->x);
1757         le13_to_be8(buf + 33, 32, P->y);
1758 }
1759
1760 /*
1761  * Multiply a curve point by an integer. The integer is assumed to be
1762  * lower than the curve order, and the base point must not be the point
1763  * at infinity.
1764  */
1765 static void
1766 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1767 {
1768         /*
1769          * qz is a flag that is initially 1, and remains equal to 1
1770          * as long as the point is the point at infinity.
1771          *
1772          * We use a 2-bit window to handle multiplier bits by pairs.
1773          * The precomputed window really is the points P2 and P3.
1774          */
1775         uint32_t qz;
1776         p256_jacobian P2, P3, Q, T, U;
1777
1778         /*
1779          * Compute window values.
1780          */
1781         P2 = *P;
1782         p256_double(&P2);
1783         P3 = *P;
1784         p256_add(&P3, &P2);
1785
1786         /*
1787          * We start with Q = 0. We process multiplier bits 2 by 2.
1788          */
1789         memset(&Q, 0, sizeof Q);
1790         qz = 1;
1791         while (xlen -- > 0) {
1792                 int k;
1793
1794                 for (k = 6; k >= 0; k -= 2) {
1795                         uint32_t bits;
1796                         uint32_t bnz;
1797
1798                         p256_double(&Q);
1799                         p256_double(&Q);
1800                         T = *P;
1801                         U = Q;
1802                         bits = (*x >> k) & (uint32_t)3;
1803                         bnz = NEQ(bits, 0);
1804                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1805                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1806                         p256_add(&U, &T);
1807                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1808                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1809                         qz &= ~bnz;
1810                 }
1811                 x ++;
1812         }
1813         *P = Q;
1814 }
1815
1816 /*
1817  * Precomputed window: k*G points, where G is the curve generator, and k
1818  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1819  * the point are encoded as 20 words of 13 bits each (little-endian
1820  * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821  * (little-endian order within each word).
1822  */
1823 static const uint32_t Gwin[15][20] = {
1824
1825         { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1826           0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1827           0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1828           0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1829           0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1830
1831         { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1832           0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1833           0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1834           0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1835           0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1836
1837         { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1838           0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1839           0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1840           0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1841           0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1842
1843         { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1844           0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1845           0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1846           0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1847           0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1848
1849         { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1850           0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1851           0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1852           0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1853           0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1854
1855         { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1856           0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1857           0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1858           0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1859           0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1860
1861         { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1862           0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1863           0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1864           0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1865           0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1866
1867         { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1868           0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1869           0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1870           0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1871           0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1872
1873         { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1874           0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1875           0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1876           0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1877           0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1878
1879         { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1880           0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1881           0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1882           0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1883           0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1884
1885         { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1886           0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1887           0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1888           0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1889           0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1890
1891         { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1892           0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1893           0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1894           0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1895           0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1896
1897         { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1898           0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1899           0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1900           0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1901           0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1902
1903         { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1904           0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1905           0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1906           0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1907           0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1908
1909         { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1910           0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1911           0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1912           0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1913           0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1914 };
1915
1916 /*
1917  * Lookup one of the Gwin[] values, by index. This is constant-time.
1918  */
1919 static void
1920 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1921 {
1922         uint32_t xy[20];
1923         uint32_t k;
1924         size_t u;
1925
1926         memset(xy, 0, sizeof xy);
1927         for (k = 0; k < 15; k ++) {
1928                 uint32_t m;
1929
1930                 m = -EQ(idx, k + 1);
1931                 for (u = 0; u < 20; u ++) {
1932                         xy[u] |= m & Gwin[k][u];
1933                 }
1934         }
1935         for (u = 0; u < 10; u ++) {
1936                 T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1937                 T->x[(u << 1) + 1] = xy[u] >> 16;
1938                 T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1939                 T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1940         }
1941         memset(T->z, 0, sizeof T->z);
1942         T->z[0] = 1;
1943 }
1944
1945 /*
1946  * Multiply the generator by an integer. The integer is assumed non-zero
1947  * and lower than the curve order.
1948  */
1949 static void
1950 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1951 {
1952         /*
1953          * qz is a flag that is initially 1, and remains equal to 1
1954          * as long as the point is the point at infinity.
1955          *
1956          * We use a 4-bit window to handle multiplier bits by groups
1957          * of 4. The precomputed window is constant static data, with
1958          * points in affine coordinates; we use a constant-time lookup.
1959          */
1960         p256_jacobian Q;
1961         uint32_t qz;
1962
1963         memset(&Q, 0, sizeof Q);
1964         qz = 1;
1965         while (xlen -- > 0) {
1966                 int k;
1967                 unsigned bx;
1968
1969                 bx = *x ++;
1970                 for (k = 0; k < 2; k ++) {
1971                         uint32_t bits;
1972                         uint32_t bnz;
1973                         p256_jacobian T, U;
1974
1975                         p256_double(&Q);
1976                         p256_double(&Q);
1977                         p256_double(&Q);
1978                         p256_double(&Q);
1979                         bits = (bx >> 4) & 0x0F;
1980                         bnz = NEQ(bits, 0);
1981                         lookup_Gwin(&T, bits);
1982                         U = Q;
1983                         p256_add_mixed(&U, &T);
1984                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1985                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1986                         qz &= ~bnz;
1987                         bx <<= 4;
1988                 }
1989         }
1990         *P = Q;
1991 }
1992
1993 static const unsigned char P256_G[] = {
1994         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1995         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1996         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1997         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1998         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1999         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
2000         0x68, 0x37, 0xBF, 0x51, 0xF5
2001 };
2002
2003 static const unsigned char P256_N[] = {
2004         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
2005         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
2006         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
2007         0x25, 0x51
2008 };
2009
2010 static const unsigned char *
2011 api_generator(int curve, size_t *len)
2012 {
2013         (void)curve;
2014         *len = sizeof P256_G;
2015         return P256_G;
2016 }
2017
2018 static const unsigned char *
2019 api_order(int curve, size_t *len)
2020 {
2021         (void)curve;
2022         *len = sizeof P256_N;
2023         return P256_N;
2024 }
2025
2026 static size_t
2027 api_xoff(int curve, size_t *len)
2028 {
2029         (void)curve;
2030         *len = 32;
2031         return 1;
2032 }
2033
2034 static uint32_t
2035 api_mul(unsigned char *G, size_t Glen,
2036         const unsigned char *x, size_t xlen, int curve)
2037 {
2038         uint32_t r;
2039         p256_jacobian P;
2040
2041         (void)curve;
2042         r = p256_decode(&P, G, Glen);
2043         p256_mul(&P, x, xlen);
2044         if (Glen >= 65) {
2045                 p256_to_affine(&P);
2046                 p256_encode(G, &P);
2047         }
2048         return r;
2049 }
2050
2051 static size_t
2052 api_mulgen(unsigned char *R,
2053         const unsigned char *x, size_t xlen, int curve)
2054 {
2055         p256_jacobian P;
2056
2057         (void)curve;
2058         p256_mulgen(&P, x, xlen);
2059         p256_to_affine(&P);
2060         p256_encode(R, &P);
2061         return 65;
2062
2063         /*
2064         const unsigned char *G;
2065         size_t Glen;
2066
2067         G = api_generator(curve, &Glen);
2068         memcpy(R, G, Glen);
2069         api_mul(R, Glen, x, xlen, curve);
2070         return Glen;
2071         */
2072 }
2073
2074 static uint32_t
2075 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2076         const unsigned char *x, size_t xlen,
2077         const unsigned char *y, size_t ylen, int curve)
2078 {
2079         p256_jacobian P, Q;
2080         uint32_t r, t, z;
2081         int i;
2082
2083         (void)curve;
2084         r = p256_decode(&P, A, len);
2085         p256_mul(&P, x, xlen);
2086         if (B == NULL) {
2087                 p256_mulgen(&Q, y, ylen);
2088         } else {
2089                 r &= p256_decode(&Q, B, len);
2090                 p256_mul(&Q, y, ylen);
2091         }
2092
2093         /*
2094          * The final addition may fail in case both points are equal.
2095          */
2096         t = p256_add(&P, &Q);
2097         reduce_final_f256(P.z);
2098         z = 0;
2099         for (i = 0; i < 20; i ++) {
2100                 z |= P.z[i];
2101         }
2102         z = EQ(z, 0);
2103         p256_double(&Q);
2104
2105         /*
2106          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2107          * have the following:
2108          *
2109          *   z = 0, t = 0   return P (normal addition)
2110          *   z = 0, t = 1   return P (normal addition)
2111          *   z = 1, t = 0   return Q (a 'double' case)
2112          *   z = 1, t = 1   report an error (P+Q = 0)
2113          */
2114         CCOPY(z & ~t, &P, &Q, sizeof Q);
2115         p256_to_affine(&P);
2116         p256_encode(A, &P);
2117         r &= ~(z & t);
2118         return r;
2119 }
2120
2121 /* see bearssl_ec.h */
2122 const br_ec_impl br_ec_p256_m15 = {
2123         (uint32_t)0x00800000,
2124         &api_generator,
2125         &api_order,
2126         &api_xoff,
2127         &api_mul,
2128         &api_mulgen,
2129         &api_muladd
2130 };