crypto/openssl/crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # ifndef OPENSSL_SYS_VMS
  33 #  include <stdint.h>
  34 # else
  35 #  include <inttypes.h>
  36 # endif
  37
  38 # include <string.h>
  39 # include <openssl/err.h>
  40 # include "ec_lcl.h"
  41
  42 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
  44 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  45                                  * platforms */
  46 # else
  47 #  error "Need GCC 3.1 or later to define type uint128_t"
  48 # endif
  49
  50 typedef uint8_t u8;
  51 typedef uint64_t u64;
  52
  53 /*
  54  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  55  * element of this field into 66 bytes where the most significant byte
  56  * contains only a single bit. We call this an felem_bytearray.
  57  */
  58
  59 typedef u8 felem_bytearray[66];
  60
  61 /*
  62  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  63  * These values are big-endian.
  64  */
  65 static const felem_bytearray nistp521_curve_params[5] = {
  66     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff},
  75     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  76      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xfc},
  84     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  85      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  86      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  87      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  88      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  89      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  90      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  91      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  92      0x3f, 0x00},
  93     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  94      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  95      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  96      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  97      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  98      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  99      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 100      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 101      0xbd, 0x66},
 102     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 103      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 104      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 105      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 106      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 107      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 108      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 109      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 110      0x66, 0x50}
 111 };
 112
 113 /*-
 114  * The representation of field elements.
 115  * ------------------------------------
 116  *
 117  * We represent field elements with nine values. These values are either 64 or
 118  * 128 bits and the field element represented is:
 119  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 120  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 121  * 58 bits apart, but are greater than 58 bits in length, the most significant
 122  * bits of each limb overlap with the least significant bits of the next.
 123  *
 124  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 125  * 'largefelem' */
 126
 127 # define NLIMBS 9
 128
 129 typedef uint64_t limb;
 130 typedef limb felem[NLIMBS];
 131 typedef uint128_t largefelem[NLIMBS];
 132
 133 static const limb bottom57bits = 0x1ffffffffffffff;
 134 static const limb bottom58bits = 0x3ffffffffffffff;
 135
 136 /*
 137  * bin66_to_felem takes a little-endian byte array and converts it into felem
 138  * form. This assumes that the CPU is little-endian.
 139  */
 140 static void bin66_to_felem(felem out, const u8 in[66])
 141 {
 142     out[0] = (*((limb *) & in[0])) & bottom58bits;
 143     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 144     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 145     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 146     out[4] = (*((limb *) & in[29])) & bottom58bits;
 147     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 148     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 149     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 150     out[8] = (*((limb *) & in[58])) & bottom57bits;
 151 }
 152
 153 /*
 154  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 155  * array. This assumes that the CPU is little-endian.
 156  */
 157 static void felem_to_bin66(u8 out[66], const felem in)
 158 {
 159     memset(out, 0, 66);
 160     (*((limb *) & out[0])) = in[0];
 161     (*((limb *) & out[7])) |= in[1] << 2;
 162     (*((limb *) & out[14])) |= in[2] << 4;
 163     (*((limb *) & out[21])) |= in[3] << 6;
 164     (*((limb *) & out[29])) = in[4];
 165     (*((limb *) & out[36])) |= in[5] << 2;
 166     (*((limb *) & out[43])) |= in[6] << 4;
 167     (*((limb *) & out[50])) |= in[7] << 6;
 168     (*((limb *) & out[58])) = in[8];
 169 }
 170
 171 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 172 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 173 {
 174     unsigned i;
 175     for (i = 0; i < len; ++i)
 176         out[i] = in[len - 1 - i];
 177 }
 178
 179 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 180 static int BN_to_felem(felem out, const BIGNUM *bn)
 181 {
 182     felem_bytearray b_in;
 183     felem_bytearray b_out;
 184     unsigned num_bytes;
 185
 186     /* BN_bn2bin eats leading zeroes */
 187     memset(b_out, 0, sizeof(b_out));
 188     num_bytes = BN_num_bytes(bn);
 189     if (num_bytes > sizeof(b_out)) {
 190         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 191         return 0;
 192     }
 193     if (BN_is_negative(bn)) {
 194         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 195         return 0;
 196     }
 197     num_bytes = BN_bn2bin(bn, b_in);
 198     flip_endian(b_out, b_in, num_bytes);
 199     bin66_to_felem(out, b_out);
 200     return 1;
 201 }
 202
 203 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 204 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 205 {
 206     felem_bytearray b_in, b_out;
 207     felem_to_bin66(b_in, in);
 208     flip_endian(b_out, b_in, sizeof(b_out));
 209     return BN_bin2bn(b_out, sizeof(b_out), out);
 210 }
 211
 212 /*-
 213  * Field operations
 214  * ----------------
 215  */
 216
 217 static void felem_one(felem out)
 218 {
 219     out[0] = 1;
 220     out[1] = 0;
 221     out[2] = 0;
 222     out[3] = 0;
 223     out[4] = 0;
 224     out[5] = 0;
 225     out[6] = 0;
 226     out[7] = 0;
 227     out[8] = 0;
 228 }
 229
 230 static void felem_assign(felem out, const felem in)
 231 {
 232     out[0] = in[0];
 233     out[1] = in[1];
 234     out[2] = in[2];
 235     out[3] = in[3];
 236     out[4] = in[4];
 237     out[5] = in[5];
 238     out[6] = in[6];
 239     out[7] = in[7];
 240     out[8] = in[8];
 241 }
 242
 243 /* felem_sum64 sets out = out + in. */
 244 static void felem_sum64(felem out, const felem in)
 245 {
 246     out[0] += in[0];
 247     out[1] += in[1];
 248     out[2] += in[2];
 249     out[3] += in[3];
 250     out[4] += in[4];
 251     out[5] += in[5];
 252     out[6] += in[6];
 253     out[7] += in[7];
 254     out[8] += in[8];
 255 }
 256
 257 /* felem_scalar sets out = in * scalar */
 258 static void felem_scalar(felem out, const felem in, limb scalar)
 259 {
 260     out[0] = in[0] * scalar;
 261     out[1] = in[1] * scalar;
 262     out[2] = in[2] * scalar;
 263     out[3] = in[3] * scalar;
 264     out[4] = in[4] * scalar;
 265     out[5] = in[5] * scalar;
 266     out[6] = in[6] * scalar;
 267     out[7] = in[7] * scalar;
 268     out[8] = in[8] * scalar;
 269 }
 270
 271 /* felem_scalar64 sets out = out * scalar */
 272 static void felem_scalar64(felem out, limb scalar)
 273 {
 274     out[0] *= scalar;
 275     out[1] *= scalar;
 276     out[2] *= scalar;
 277     out[3] *= scalar;
 278     out[4] *= scalar;
 279     out[5] *= scalar;
 280     out[6] *= scalar;
 281     out[7] *= scalar;
 282     out[8] *= scalar;
 283 }
 284
 285 /* felem_scalar128 sets out = out * scalar */
 286 static void felem_scalar128(largefelem out, limb scalar)
 287 {
 288     out[0] *= scalar;
 289     out[1] *= scalar;
 290     out[2] *= scalar;
 291     out[3] *= scalar;
 292     out[4] *= scalar;
 293     out[5] *= scalar;
 294     out[6] *= scalar;
 295     out[7] *= scalar;
 296     out[8] *= scalar;
 297 }
 298
 299 /*-
 300  * felem_neg sets |out| to |-in|
 301  * On entry:
 302  *   in[i] < 2^59 + 2^14
 303  * On exit:
 304  *   out[i] < 2^62
 305  */
 306 static void felem_neg(felem out, const felem in)
 307 {
 308     /* In order to prevent underflow, we subtract from 0 mod p. */
 309     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 310     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 311
 312     out[0] = two62m3 - in[0];
 313     out[1] = two62m2 - in[1];
 314     out[2] = two62m2 - in[2];
 315     out[3] = two62m2 - in[3];
 316     out[4] = two62m2 - in[4];
 317     out[5] = two62m2 - in[5];
 318     out[6] = two62m2 - in[6];
 319     out[7] = two62m2 - in[7];
 320     out[8] = two62m2 - in[8];
 321 }
 322
 323 /*-
 324  * felem_diff64 subtracts |in| from |out|
 325  * On entry:
 326  *   in[i] < 2^59 + 2^14
 327  * On exit:
 328  *   out[i] < out[i] + 2^62
 329  */
 330 static void felem_diff64(felem out, const felem in)
 331 {
 332     /*
 333      * In order to prevent underflow, we add 0 mod p before subtracting.
 334      */
 335     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 336     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 337
 338     out[0] += two62m3 - in[0];
 339     out[1] += two62m2 - in[1];
 340     out[2] += two62m2 - in[2];
 341     out[3] += two62m2 - in[3];
 342     out[4] += two62m2 - in[4];
 343     out[5] += two62m2 - in[5];
 344     out[6] += two62m2 - in[6];
 345     out[7] += two62m2 - in[7];
 346     out[8] += two62m2 - in[8];
 347 }
 348
 349 /*-
 350  * felem_diff_128_64 subtracts |in| from |out|
 351  * On entry:
 352  *   in[i] < 2^62 + 2^17
 353  * On exit:
 354  *   out[i] < out[i] + 2^63
 355  */
 356 static void felem_diff_128_64(largefelem out, const felem in)
 357 {
 358     /*
 359      * In order to prevent underflow, we add 0 mod p before subtracting.
 360      */
 361     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
 362     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
 363
 364     out[0] += two63m6 - in[0];
 365     out[1] += two63m5 - in[1];
 366     out[2] += two63m5 - in[2];
 367     out[3] += two63m5 - in[3];
 368     out[4] += two63m5 - in[4];
 369     out[5] += two63m5 - in[5];
 370     out[6] += two63m5 - in[6];
 371     out[7] += two63m5 - in[7];
 372     out[8] += two63m5 - in[8];
 373 }
 374
 375 /*-
 376  * felem_diff_128_64 subtracts |in| from |out|
 377  * On entry:
 378  *   in[i] < 2^126
 379  * On exit:
 380  *   out[i] < out[i] + 2^127 - 2^69
 381  */
 382 static void felem_diff128(largefelem out, const largefelem in)
 383 {
 384     /*
 385      * In order to prevent underflow, we add 0 mod p before subtracting.
 386      */
 387     static const uint128_t two127m70 =
 388         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 389     static const uint128_t two127m69 =
 390         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 391
 392     out[0] += (two127m70 - in[0]);
 393     out[1] += (two127m69 - in[1]);
 394     out[2] += (two127m69 - in[2]);
 395     out[3] += (two127m69 - in[3]);
 396     out[4] += (two127m69 - in[4]);
 397     out[5] += (two127m69 - in[5]);
 398     out[6] += (two127m69 - in[6]);
 399     out[7] += (two127m69 - in[7]);
 400     out[8] += (two127m69 - in[8]);
 401 }
 402
 403 /*-
 404  * felem_square sets |out| = |in|^2
 405  * On entry:
 406  *   in[i] < 2^62
 407  * On exit:
 408  *   out[i] < 17 * max(in[i]) * max(in[i])
 409  */
 410 static void felem_square(largefelem out, const felem in)
 411 {
 412     felem inx2, inx4;
 413     felem_scalar(inx2, in, 2);
 414     felem_scalar(inx4, in, 4);
 415
 416     /*-
 417      * We have many cases were we want to do
 418      *   in[x] * in[y] +
 419      *   in[y] * in[x]
 420      * This is obviously just
 421      *   2 * in[x] * in[y]
 422      * However, rather than do the doubling on the 128 bit result, we
 423      * double one of the inputs to the multiplication by reading from
 424      * |inx2|
 425      */
 426
 427     out[0] = ((uint128_t) in[0]) * in[0];
 428     out[1] = ((uint128_t) in[0]) * inx2[1];
 429     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 430     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 431     out[4] = ((uint128_t) in[0]) * inx2[4] +
 432         ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 433     out[5] = ((uint128_t) in[0]) * inx2[5] +
 434         ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 435     out[6] = ((uint128_t) in[0]) * inx2[6] +
 436         ((uint128_t) in[1]) * inx2[5] +
 437         ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 438     out[7] = ((uint128_t) in[0]) * inx2[7] +
 439         ((uint128_t) in[1]) * inx2[6] +
 440         ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 441     out[8] = ((uint128_t) in[0]) * inx2[8] +
 442         ((uint128_t) in[1]) * inx2[7] +
 443         ((uint128_t) in[2]) * inx2[6] +
 444         ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 445
 446     /*
 447      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 448      * They correspond to locations one bit up from the limbs produced above
 449      * so we would have to multiply by two to align them. Again, rather than
 450      * operate on the 128-bit result, we double one of the inputs to the
 451      * multiplication. If we want to double for both this reason, and the
 452      * reason above, then we end up multiplying by four.
 453      */
 454
 455     /* 9 */
 456     out[0] += ((uint128_t) in[1]) * inx4[8] +
 457         ((uint128_t) in[2]) * inx4[7] +
 458         ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 459
 460     /* 10 */
 461     out[1] += ((uint128_t) in[2]) * inx4[8] +
 462         ((uint128_t) in[3]) * inx4[7] +
 463         ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 464
 465     /* 11 */
 466     out[2] += ((uint128_t) in[3]) * inx4[8] +
 467         ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 468
 469     /* 12 */
 470     out[3] += ((uint128_t) in[4]) * inx4[8] +
 471         ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 472
 473     /* 13 */
 474     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 475
 476     /* 14 */
 477     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 478
 479     /* 15 */
 480     out[6] += ((uint128_t) in[7]) * inx4[8];
 481
 482     /* 16 */
 483     out[7] += ((uint128_t) in[8]) * inx2[8];
 484 }
 485
 486 /*-
 487  * felem_mul sets |out| = |in1| * |in2|
 488  * On entry:
 489  *   in1[i] < 2^64
 490  *   in2[i] < 2^63
 491  * On exit:
 492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 493  */
 494 static void felem_mul(largefelem out, const felem in1, const felem in2)
 495 {
 496     felem in2x2;
 497     felem_scalar(in2x2, in2, 2);
 498
 499     out[0] = ((uint128_t) in1[0]) * in2[0];
 500
 501     out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
 502
 503     out[2] = ((uint128_t) in1[0]) * in2[2] +
 504         ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
 505
 506     out[3] = ((uint128_t) in1[0]) * in2[3] +
 507         ((uint128_t) in1[1]) * in2[2] +
 508         ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
 509
 510     out[4] = ((uint128_t) in1[0]) * in2[4] +
 511         ((uint128_t) in1[1]) * in2[3] +
 512         ((uint128_t) in1[2]) * in2[2] +
 513         ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
 514
 515     out[5] = ((uint128_t) in1[0]) * in2[5] +
 516         ((uint128_t) in1[1]) * in2[4] +
 517         ((uint128_t) in1[2]) * in2[3] +
 518         ((uint128_t) in1[3]) * in2[2] +
 519         ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
 520
 521     out[6] = ((uint128_t) in1[0]) * in2[6] +
 522         ((uint128_t) in1[1]) * in2[5] +
 523         ((uint128_t) in1[2]) * in2[4] +
 524         ((uint128_t) in1[3]) * in2[3] +
 525         ((uint128_t) in1[4]) * in2[2] +
 526         ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
 527
 528     out[7] = ((uint128_t) in1[0]) * in2[7] +
 529         ((uint128_t) in1[1]) * in2[6] +
 530         ((uint128_t) in1[2]) * in2[5] +
 531         ((uint128_t) in1[3]) * in2[4] +
 532         ((uint128_t) in1[4]) * in2[3] +
 533         ((uint128_t) in1[5]) * in2[2] +
 534         ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
 535
 536     out[8] = ((uint128_t) in1[0]) * in2[8] +
 537         ((uint128_t) in1[1]) * in2[7] +
 538         ((uint128_t) in1[2]) * in2[6] +
 539         ((uint128_t) in1[3]) * in2[5] +
 540         ((uint128_t) in1[4]) * in2[4] +
 541         ((uint128_t) in1[5]) * in2[3] +
 542         ((uint128_t) in1[6]) * in2[2] +
 543         ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
 544
 545     /* See comment in felem_square about the use of in2x2 here */
 546
 547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 548         ((uint128_t) in1[2]) * in2x2[7] +
 549         ((uint128_t) in1[3]) * in2x2[6] +
 550         ((uint128_t) in1[4]) * in2x2[5] +
 551         ((uint128_t) in1[5]) * in2x2[4] +
 552         ((uint128_t) in1[6]) * in2x2[3] +
 553         ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
 554
 555     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 556         ((uint128_t) in1[3]) * in2x2[7] +
 557         ((uint128_t) in1[4]) * in2x2[6] +
 558         ((uint128_t) in1[5]) * in2x2[5] +
 559         ((uint128_t) in1[6]) * in2x2[4] +
 560         ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
 561
 562     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 563         ((uint128_t) in1[4]) * in2x2[7] +
 564         ((uint128_t) in1[5]) * in2x2[6] +
 565         ((uint128_t) in1[6]) * in2x2[5] +
 566         ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
 567
 568     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 569         ((uint128_t) in1[5]) * in2x2[7] +
 570         ((uint128_t) in1[6]) * in2x2[6] +
 571         ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
 572
 573     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 574         ((uint128_t) in1[6]) * in2x2[7] +
 575         ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
 576
 577     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 578         ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
 579
 580     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 581         ((uint128_t) in1[8]) * in2x2[7];
 582
 583     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 584 }
 585
 586 static const limb bottom52bits = 0xfffffffffffff;
 587
 588 /*-
 589  * felem_reduce converts a largefelem to an felem.
 590  * On entry:
 591  *   in[i] < 2^128
 592  * On exit:
 593  *   out[i] < 2^59 + 2^14
 594  */
 595 static void felem_reduce(felem out, const largefelem in)
 596 {
 597     u64 overflow1, overflow2;
 598
 599     out[0] = ((limb) in[0]) & bottom58bits;
 600     out[1] = ((limb) in[1]) & bottom58bits;
 601     out[2] = ((limb) in[2]) & bottom58bits;
 602     out[3] = ((limb) in[3]) & bottom58bits;
 603     out[4] = ((limb) in[4]) & bottom58bits;
 604     out[5] = ((limb) in[5]) & bottom58bits;
 605     out[6] = ((limb) in[6]) & bottom58bits;
 606     out[7] = ((limb) in[7]) & bottom58bits;
 607     out[8] = ((limb) in[8]) & bottom58bits;
 608
 609     /* out[i] < 2^58 */
 610
 611     out[1] += ((limb) in[0]) >> 58;
 612     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 613     /*-
 614      * out[1] < 2^58 + 2^6 + 2^58
 615      *        = 2^59 + 2^6
 616      */
 617     out[2] += ((limb) (in[0] >> 64)) >> 52;
 618
 619     out[2] += ((limb) in[1]) >> 58;
 620     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 621     out[3] += ((limb) (in[1] >> 64)) >> 52;
 622
 623     out[3] += ((limb) in[2]) >> 58;
 624     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 625     out[4] += ((limb) (in[2] >> 64)) >> 52;
 626
 627     out[4] += ((limb) in[3]) >> 58;
 628     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 629     out[5] += ((limb) (in[3] >> 64)) >> 52;
 630
 631     out[5] += ((limb) in[4]) >> 58;
 632     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 633     out[6] += ((limb) (in[4] >> 64)) >> 52;
 634
 635     out[6] += ((limb) in[5]) >> 58;
 636     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 637     out[7] += ((limb) (in[5] >> 64)) >> 52;
 638
 639     out[7] += ((limb) in[6]) >> 58;
 640     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 641     out[8] += ((limb) (in[6] >> 64)) >> 52;
 642
 643     out[8] += ((limb) in[7]) >> 58;
 644     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 645     /*-
 646      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 647      *            < 2^59 + 2^13
 648      */
 649     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 650
 651     overflow1 += ((limb) in[8]) >> 58;
 652     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 653     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 654
 655     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 656     overflow2 <<= 1;            /* overflow2 < 2^13 */
 657
 658     out[0] += overflow1;        /* out[0] < 2^60 */
 659     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 660
 661     out[1] += out[0] >> 58;
 662     out[0] &= bottom58bits;
 663     /*-
 664      * out[0] < 2^58
 665      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 666      *        < 2^59 + 2^14
 667      */
 668 }
 669
 670 static void felem_square_reduce(felem out, const felem in)
 671 {
 672     largefelem tmp;
 673     felem_square(tmp, in);
 674     felem_reduce(out, tmp);
 675 }
 676
 677 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 678 {
 679     largefelem tmp;
 680     felem_mul(tmp, in1, in2);
 681     felem_reduce(out, tmp);
 682 }
 683
 684 /*-
 685  * felem_inv calculates |out| = |in|^{-1}
 686  *
 687  * Based on Fermat's Little Theorem:
 688  *   a^p = a (mod p)
 689  *   a^{p-1} = 1 (mod p)
 690  *   a^{p-2} = a^{-1} (mod p)
 691  */
 692 static void felem_inv(felem out, const felem in)
 693 {
 694     felem ftmp, ftmp2, ftmp3, ftmp4;
 695     largefelem tmp;
 696     unsigned i;
 697
 698     felem_square(tmp, in);
 699     felem_reduce(ftmp, tmp);    /* 2^1 */
 700     felem_mul(tmp, in, ftmp);
 701     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 702     felem_assign(ftmp2, ftmp);
 703     felem_square(tmp, ftmp);
 704     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 705     felem_mul(tmp, in, ftmp);
 706     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 707     felem_square(tmp, ftmp);
 708     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 709
 710     felem_square(tmp, ftmp2);
 711     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 712     felem_square(tmp, ftmp3);
 713     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 714     felem_mul(tmp, ftmp3, ftmp2);
 715     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 716
 717     felem_assign(ftmp2, ftmp3);
 718     felem_square(tmp, ftmp3);
 719     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 720     felem_square(tmp, ftmp3);
 721     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 722     felem_square(tmp, ftmp3);
 723     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 724     felem_square(tmp, ftmp3);
 725     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 726     felem_assign(ftmp4, ftmp3);
 727     felem_mul(tmp, ftmp3, ftmp);
 728     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 729     felem_square(tmp, ftmp4);
 730     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 731     felem_mul(tmp, ftmp3, ftmp2);
 732     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 733     felem_assign(ftmp2, ftmp3);
 734
 735     for (i = 0; i < 8; i++) {
 736         felem_square(tmp, ftmp3);
 737         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 738     }
 739     felem_mul(tmp, ftmp3, ftmp2);
 740     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 741     felem_assign(ftmp2, ftmp3);
 742
 743     for (i = 0; i < 16; i++) {
 744         felem_square(tmp, ftmp3);
 745         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 746     }
 747     felem_mul(tmp, ftmp3, ftmp2);
 748     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 749     felem_assign(ftmp2, ftmp3);
 750
 751     for (i = 0; i < 32; i++) {
 752         felem_square(tmp, ftmp3);
 753         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 754     }
 755     felem_mul(tmp, ftmp3, ftmp2);
 756     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 757     felem_assign(ftmp2, ftmp3);
 758
 759     for (i = 0; i < 64; i++) {
 760         felem_square(tmp, ftmp3);
 761         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 762     }
 763     felem_mul(tmp, ftmp3, ftmp2);
 764     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 765     felem_assign(ftmp2, ftmp3);
 766
 767     for (i = 0; i < 128; i++) {
 768         felem_square(tmp, ftmp3);
 769         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 770     }
 771     felem_mul(tmp, ftmp3, ftmp2);
 772     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 773     felem_assign(ftmp2, ftmp3);
 774
 775     for (i = 0; i < 256; i++) {
 776         felem_square(tmp, ftmp3);
 777         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 778     }
 779     felem_mul(tmp, ftmp3, ftmp2);
 780     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 781
 782     for (i = 0; i < 9; i++) {
 783         felem_square(tmp, ftmp3);
 784         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 785     }
 786     felem_mul(tmp, ftmp3, ftmp4);
 787     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 788     felem_mul(tmp, ftmp3, in);
 789     felem_reduce(out, tmp);     /* 2^512 - 3 */
 790 }
 791
 792 /* This is 2^521-1, expressed as an felem */
 793 static const felem kPrime = {
 794     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 795     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 796     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 797 };
 798
 799 /*-
 800  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 801  * otherwise.
 802  * On entry:
 803  *   in[i] < 2^59 + 2^14
 804  */
 805 static limb felem_is_zero(const felem in)
 806 {
 807     felem ftmp;
 808     limb is_zero, is_p;
 809     felem_assign(ftmp, in);
 810
 811     ftmp[0] += ftmp[8] >> 57;
 812     ftmp[8] &= bottom57bits;
 813     /* ftmp[8] < 2^57 */
 814     ftmp[1] += ftmp[0] >> 58;
 815     ftmp[0] &= bottom58bits;
 816     ftmp[2] += ftmp[1] >> 58;
 817     ftmp[1] &= bottom58bits;
 818     ftmp[3] += ftmp[2] >> 58;
 819     ftmp[2] &= bottom58bits;
 820     ftmp[4] += ftmp[3] >> 58;
 821     ftmp[3] &= bottom58bits;
 822     ftmp[5] += ftmp[4] >> 58;
 823     ftmp[4] &= bottom58bits;
 824     ftmp[6] += ftmp[5] >> 58;
 825     ftmp[5] &= bottom58bits;
 826     ftmp[7] += ftmp[6] >> 58;
 827     ftmp[6] &= bottom58bits;
 828     ftmp[8] += ftmp[7] >> 58;
 829     ftmp[7] &= bottom58bits;
 830     /* ftmp[8] < 2^57 + 4 */
 831
 832     /*
 833      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 834      * than our bound for ftmp[8]. Therefore we only have to check if the
 835      * zero is zero or 2^521-1.
 836      */
 837
 838     is_zero = 0;
 839     is_zero |= ftmp[0];
 840     is_zero |= ftmp[1];
 841     is_zero |= ftmp[2];
 842     is_zero |= ftmp[3];
 843     is_zero |= ftmp[4];
 844     is_zero |= ftmp[5];
 845     is_zero |= ftmp[6];
 846     is_zero |= ftmp[7];
 847     is_zero |= ftmp[8];
 848
 849     is_zero--;
 850     /*
 851      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 852      * can be set is if is_zero was 0 before the decrement.
 853      */
 854     is_zero = 0 - (is_zero >> 63);
 855
 856     is_p = ftmp[0] ^ kPrime[0];
 857     is_p |= ftmp[1] ^ kPrime[1];
 858     is_p |= ftmp[2] ^ kPrime[2];
 859     is_p |= ftmp[3] ^ kPrime[3];
 860     is_p |= ftmp[4] ^ kPrime[4];
 861     is_p |= ftmp[5] ^ kPrime[5];
 862     is_p |= ftmp[6] ^ kPrime[6];
 863     is_p |= ftmp[7] ^ kPrime[7];
 864     is_p |= ftmp[8] ^ kPrime[8];
 865
 866     is_p--;
 867     is_p = 0 - (is_p >> 63);
 868
 869     is_zero |= is_p;
 870     return is_zero;
 871 }
 872
 873 static int felem_is_zero_int(const void *in)
 874 {
 875     return (int)(felem_is_zero(in) & ((limb) 1));
 876 }
 877
 878 /*-
 879  * felem_contract converts |in| to its unique, minimal representation.
 880  * On entry:
 881  *   in[i] < 2^59 + 2^14
 882  */
 883 static void felem_contract(felem out, const felem in)
 884 {
 885     limb is_p, is_greater, sign;
 886     static const limb two58 = ((limb) 1) << 58;
 887
 888     felem_assign(out, in);
 889
 890     out[0] += out[8] >> 57;
 891     out[8] &= bottom57bits;
 892     /* out[8] < 2^57 */
 893     out[1] += out[0] >> 58;
 894     out[0] &= bottom58bits;
 895     out[2] += out[1] >> 58;
 896     out[1] &= bottom58bits;
 897     out[3] += out[2] >> 58;
 898     out[2] &= bottom58bits;
 899     out[4] += out[3] >> 58;
 900     out[3] &= bottom58bits;
 901     out[5] += out[4] >> 58;
 902     out[4] &= bottom58bits;
 903     out[6] += out[5] >> 58;
 904     out[5] &= bottom58bits;
 905     out[7] += out[6] >> 58;
 906     out[6] &= bottom58bits;
 907     out[8] += out[7] >> 58;
 908     out[7] &= bottom58bits;
 909     /* out[8] < 2^57 + 4 */
 910
 911     /*
 912      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 913      * out. See the comments in felem_is_zero regarding why we don't test for
 914      * other multiples of the prime.
 915      */
 916
 917     /*
 918      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 919      */
 920
 921     is_p = out[0] ^ kPrime[0];
 922     is_p |= out[1] ^ kPrime[1];
 923     is_p |= out[2] ^ kPrime[2];
 924     is_p |= out[3] ^ kPrime[3];
 925     is_p |= out[4] ^ kPrime[4];
 926     is_p |= out[5] ^ kPrime[5];
 927     is_p |= out[6] ^ kPrime[6];
 928     is_p |= out[7] ^ kPrime[7];
 929     is_p |= out[8] ^ kPrime[8];
 930
 931     is_p--;
 932     is_p &= is_p << 32;
 933     is_p &= is_p << 16;
 934     is_p &= is_p << 8;
 935     is_p &= is_p << 4;
 936     is_p &= is_p << 2;
 937     is_p &= is_p << 1;
 938     is_p = 0 - (is_p >> 63);
 939     is_p = ~is_p;
 940
 941     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 942
 943     out[0] &= is_p;
 944     out[1] &= is_p;
 945     out[2] &= is_p;
 946     out[3] &= is_p;
 947     out[4] &= is_p;
 948     out[5] &= is_p;
 949     out[6] &= is_p;
 950     out[7] &= is_p;
 951     out[8] &= is_p;
 952
 953     /*
 954      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 955      * 57 is greater than zero as (2^521-1) + x >= 2^522
 956      */
 957     is_greater = out[8] >> 57;
 958     is_greater |= is_greater << 32;
 959     is_greater |= is_greater << 16;
 960     is_greater |= is_greater << 8;
 961     is_greater |= is_greater << 4;
 962     is_greater |= is_greater << 2;
 963     is_greater |= is_greater << 1;
 964     is_greater = 0 - (is_greater >> 63);
 965
 966     out[0] -= kPrime[0] & is_greater;
 967     out[1] -= kPrime[1] & is_greater;
 968     out[2] -= kPrime[2] & is_greater;
 969     out[3] -= kPrime[3] & is_greater;
 970     out[4] -= kPrime[4] & is_greater;
 971     out[5] -= kPrime[5] & is_greater;
 972     out[6] -= kPrime[6] & is_greater;
 973     out[7] -= kPrime[7] & is_greater;
 974     out[8] -= kPrime[8] & is_greater;
 975
 976     /* Eliminate negative coefficients */
 977     sign = -(out[0] >> 63);
 978     out[0] += (two58 & sign);
 979     out[1] -= (1 & sign);
 980     sign = -(out[1] >> 63);
 981     out[1] += (two58 & sign);
 982     out[2] -= (1 & sign);
 983     sign = -(out[2] >> 63);
 984     out[2] += (two58 & sign);
 985     out[3] -= (1 & sign);
 986     sign = -(out[3] >> 63);
 987     out[3] += (two58 & sign);
 988     out[4] -= (1 & sign);
 989     sign = -(out[4] >> 63);
 990     out[4] += (two58 & sign);
 991     out[5] -= (1 & sign);
 992     sign = -(out[0] >> 63);
 993     out[5] += (two58 & sign);
 994     out[6] -= (1 & sign);
 995     sign = -(out[6] >> 63);
 996     out[6] += (two58 & sign);
 997     out[7] -= (1 & sign);
 998     sign = -(out[7] >> 63);
 999     out[7] += (two58 & sign);
1000     out[8] -= (1 & sign);
1001     sign = -(out[5] >> 63);
1002     out[5] += (two58 & sign);
1003     out[6] -= (1 & sign);
1004     sign = -(out[6] >> 63);
1005     out[6] += (two58 & sign);
1006     out[7] -= (1 & sign);
1007     sign = -(out[7] >> 63);
1008     out[7] += (two58 & sign);
1009     out[8] -= (1 & sign);
1010 }
1011
1012 /*-
1013  * Group operations
1014  * ----------------
1015  *
1016  * Building on top of the field operations we have the operations on the
1017  * elliptic curve group itself. Points on the curve are represented in Jacobian
1018  * coordinates */
1019
1020 /*-
1021  * point_double calcuates 2*(x_in, y_in, z_in)
1022  *
1023  * The method is taken from:
1024  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1025  *
1026  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1027  * while x_out == y_in is not (maybe this works, but it's not tested). */
1028 static void
1029 point_double(felem x_out, felem y_out, felem z_out,
1030              const felem x_in, const felem y_in, const felem z_in)
1031 {
1032     largefelem tmp, tmp2;
1033     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1034
1035     felem_assign(ftmp, x_in);
1036     felem_assign(ftmp2, x_in);
1037
1038     /* delta = z^2 */
1039     felem_square(tmp, z_in);
1040     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1041
1042     /* gamma = y^2 */
1043     felem_square(tmp, y_in);
1044     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1045
1046     /* beta = x*gamma */
1047     felem_mul(tmp, x_in, gamma);
1048     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1049
1050     /* alpha = 3*(x-delta)*(x+delta) */
1051     felem_diff64(ftmp, delta);
1052     /* ftmp[i] < 2^61 */
1053     felem_sum64(ftmp2, delta);
1054     /* ftmp2[i] < 2^60 + 2^15 */
1055     felem_scalar64(ftmp2, 3);
1056     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1057     felem_mul(tmp, ftmp, ftmp2);
1058     /*-
1059      * tmp[i] < 17(3*2^121 + 3*2^76)
1060      *        = 61*2^121 + 61*2^76
1061      *        < 64*2^121 + 64*2^76
1062      *        = 2^127 + 2^82
1063      *        < 2^128
1064      */
1065     felem_reduce(alpha, tmp);
1066
1067     /* x' = alpha^2 - 8*beta */
1068     felem_square(tmp, alpha);
1069     /*
1070      * tmp[i] < 17*2^120 < 2^125
1071      */
1072     felem_assign(ftmp, beta);
1073     felem_scalar64(ftmp, 8);
1074     /* ftmp[i] < 2^62 + 2^17 */
1075     felem_diff_128_64(tmp, ftmp);
1076     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1077     felem_reduce(x_out, tmp);
1078
1079     /* z' = (y + z)^2 - gamma - delta */
1080     felem_sum64(delta, gamma);
1081     /* delta[i] < 2^60 + 2^15 */
1082     felem_assign(ftmp, y_in);
1083     felem_sum64(ftmp, z_in);
1084     /* ftmp[i] < 2^60 + 2^15 */
1085     felem_square(tmp, ftmp);
1086     /*
1087      * tmp[i] < 17(2^122) < 2^127
1088      */
1089     felem_diff_128_64(tmp, delta);
1090     /* tmp[i] < 2^127 + 2^63 */
1091     felem_reduce(z_out, tmp);
1092
1093     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1094     felem_scalar64(beta, 4);
1095     /* beta[i] < 2^61 + 2^16 */
1096     felem_diff64(beta, x_out);
1097     /* beta[i] < 2^61 + 2^60 + 2^16 */
1098     felem_mul(tmp, alpha, beta);
1099     /*-
1100      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1101      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1102      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1103      *        < 2^128
1104      */
1105     felem_square(tmp2, gamma);
1106     /*-
1107      * tmp2[i] < 17*(2^59 + 2^14)^2
1108      *         = 17*(2^118 + 2^74 + 2^28)
1109      */
1110     felem_scalar128(tmp2, 8);
1111     /*-
1112      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1113      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1114      *         < 2^126
1115      */
1116     felem_diff128(tmp, tmp2);
1117     /*-
1118      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1119      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1120      *          2^74 + 2^69 + 2^34 + 2^30
1121      *        < 2^128
1122      */
1123     felem_reduce(y_out, tmp);
1124 }
1125
1126 /* copy_conditional copies in to out iff mask is all ones. */
1127 static void copy_conditional(felem out, const felem in, limb mask)
1128 {
1129     unsigned i;
1130     for (i = 0; i < NLIMBS; ++i) {
1131         const limb tmp = mask & (in[i] ^ out[i]);
1132         out[i] ^= tmp;
1133     }
1134 }
1135
1136 /*-
1137  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1138  *
1139  * The method is taken from
1140  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1141  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1142  *
1143  * This function includes a branch for checking whether the two input points
1144  * are equal (while not equal to the point at infinity). This case never
1145  * happens during single point multiplication, so there is no timing leak for
1146  * ECDH or ECDSA signing. */
1147 static void point_add(felem x3, felem y3, felem z3,
1148                       const felem x1, const felem y1, const felem z1,
1149                       const int mixed, const felem x2, const felem y2,
1150                       const felem z2)
1151 {
1152     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1153     largefelem tmp, tmp2;
1154     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1155
1156     z1_is_zero = felem_is_zero(z1);
1157     z2_is_zero = felem_is_zero(z2);
1158
1159     /* ftmp = z1z1 = z1**2 */
1160     felem_square(tmp, z1);
1161     felem_reduce(ftmp, tmp);
1162
1163     if (!mixed) {
1164         /* ftmp2 = z2z2 = z2**2 */
1165         felem_square(tmp, z2);
1166         felem_reduce(ftmp2, tmp);
1167
1168         /* u1 = ftmp3 = x1*z2z2 */
1169         felem_mul(tmp, x1, ftmp2);
1170         felem_reduce(ftmp3, tmp);
1171
1172         /* ftmp5 = z1 + z2 */
1173         felem_assign(ftmp5, z1);
1174         felem_sum64(ftmp5, z2);
1175         /* ftmp5[i] < 2^61 */
1176
1177         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1178         felem_square(tmp, ftmp5);
1179         /* tmp[i] < 17*2^122 */
1180         felem_diff_128_64(tmp, ftmp);
1181         /* tmp[i] < 17*2^122 + 2^63 */
1182         felem_diff_128_64(tmp, ftmp2);
1183         /* tmp[i] < 17*2^122 + 2^64 */
1184         felem_reduce(ftmp5, tmp);
1185
1186         /* ftmp2 = z2 * z2z2 */
1187         felem_mul(tmp, ftmp2, z2);
1188         felem_reduce(ftmp2, tmp);
1189
1190         /* s1 = ftmp6 = y1 * z2**3 */
1191         felem_mul(tmp, y1, ftmp2);
1192         felem_reduce(ftmp6, tmp);
1193     } else {
1194         /*
1195          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1196          */
1197
1198         /* u1 = ftmp3 = x1*z2z2 */
1199         felem_assign(ftmp3, x1);
1200
1201         /* ftmp5 = 2*z1z2 */
1202         felem_scalar(ftmp5, z1, 2);
1203
1204         /* s1 = ftmp6 = y1 * z2**3 */
1205         felem_assign(ftmp6, y1);
1206     }
1207
1208     /* u2 = x2*z1z1 */
1209     felem_mul(tmp, x2, ftmp);
1210     /* tmp[i] < 17*2^120 */
1211
1212     /* h = ftmp4 = u2 - u1 */
1213     felem_diff_128_64(tmp, ftmp3);
1214     /* tmp[i] < 17*2^120 + 2^63 */
1215     felem_reduce(ftmp4, tmp);
1216
1217     x_equal = felem_is_zero(ftmp4);
1218
1219     /* z_out = ftmp5 * h */
1220     felem_mul(tmp, ftmp5, ftmp4);
1221     felem_reduce(z_out, tmp);
1222
1223     /* ftmp = z1 * z1z1 */
1224     felem_mul(tmp, ftmp, z1);
1225     felem_reduce(ftmp, tmp);
1226
1227     /* s2 = tmp = y2 * z1**3 */
1228     felem_mul(tmp, y2, ftmp);
1229     /* tmp[i] < 17*2^120 */
1230
1231     /* r = ftmp5 = (s2 - s1)*2 */
1232     felem_diff_128_64(tmp, ftmp6);
1233     /* tmp[i] < 17*2^120 + 2^63 */
1234     felem_reduce(ftmp5, tmp);
1235     y_equal = felem_is_zero(ftmp5);
1236     felem_scalar64(ftmp5, 2);
1237     /* ftmp5[i] < 2^61 */
1238
1239     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1240         point_double(x3, y3, z3, x1, y1, z1);
1241         return;
1242     }
1243
1244     /* I = ftmp = (2h)**2 */
1245     felem_assign(ftmp, ftmp4);
1246     felem_scalar64(ftmp, 2);
1247     /* ftmp[i] < 2^61 */
1248     felem_square(tmp, ftmp);
1249     /* tmp[i] < 17*2^122 */
1250     felem_reduce(ftmp, tmp);
1251
1252     /* J = ftmp2 = h * I */
1253     felem_mul(tmp, ftmp4, ftmp);
1254     felem_reduce(ftmp2, tmp);
1255
1256     /* V = ftmp4 = U1 * I */
1257     felem_mul(tmp, ftmp3, ftmp);
1258     felem_reduce(ftmp4, tmp);
1259
1260     /* x_out = r**2 - J - 2V */
1261     felem_square(tmp, ftmp5);
1262     /* tmp[i] < 17*2^122 */
1263     felem_diff_128_64(tmp, ftmp2);
1264     /* tmp[i] < 17*2^122 + 2^63 */
1265     felem_assign(ftmp3, ftmp4);
1266     felem_scalar64(ftmp4, 2);
1267     /* ftmp4[i] < 2^61 */
1268     felem_diff_128_64(tmp, ftmp4);
1269     /* tmp[i] < 17*2^122 + 2^64 */
1270     felem_reduce(x_out, tmp);
1271
1272     /* y_out = r(V-x_out) - 2 * s1 * J */
1273     felem_diff64(ftmp3, x_out);
1274     /*
1275      * ftmp3[i] < 2^60 + 2^60 = 2^61
1276      */
1277     felem_mul(tmp, ftmp5, ftmp3);
1278     /* tmp[i] < 17*2^122 */
1279     felem_mul(tmp2, ftmp6, ftmp2);
1280     /* tmp2[i] < 17*2^120 */
1281     felem_scalar128(tmp2, 2);
1282     /* tmp2[i] < 17*2^121 */
1283     felem_diff128(tmp, tmp2);
1284         /*-
1285          * tmp[i] < 2^127 - 2^69 + 17*2^122
1286          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1287          *        < 2^127
1288          */
1289     felem_reduce(y_out, tmp);
1290
1291     copy_conditional(x_out, x2, z1_is_zero);
1292     copy_conditional(x_out, x1, z2_is_zero);
1293     copy_conditional(y_out, y2, z1_is_zero);
1294     copy_conditional(y_out, y1, z2_is_zero);
1295     copy_conditional(z_out, z2, z1_is_zero);
1296     copy_conditional(z_out, z1, z2_is_zero);
1297     felem_assign(x3, x_out);
1298     felem_assign(y3, y_out);
1299     felem_assign(z3, z_out);
1300 }
1301
1302 /*-
1303  * Base point pre computation
1304  * --------------------------
1305  *
1306  * Two different sorts of precomputed tables are used in the following code.
1307  * Each contain various points on the curve, where each point is three field
1308  * elements (x, y, z).
1309  *
1310  * For the base point table, z is usually 1 (0 for the point at infinity).
1311  * This table has 16 elements:
1312  * index | bits    | point
1313  * ------+---------+------------------------------
1314  *     0 | 0 0 0 0 | 0G
1315  *     1 | 0 0 0 1 | 1G
1316  *     2 | 0 0 1 0 | 2^130G
1317  *     3 | 0 0 1 1 | (2^130 + 1)G
1318  *     4 | 0 1 0 0 | 2^260G
1319  *     5 | 0 1 0 1 | (2^260 + 1)G
1320  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1321  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1322  *     8 | 1 0 0 0 | 2^390G
1323  *     9 | 1 0 0 1 | (2^390 + 1)G
1324  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1325  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1326  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1327  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1328  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1329  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1330  *
1331  * The reason for this is so that we can clock bits into four different
1332  * locations when doing simple scalar multiplies against the base point.
1333  *
1334  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1335
1336 /* gmul is the table of precomputed base points */
1337 static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1338                                     {0, 0, 0, 0, 0, 0, 0, 0, 0},
1339                                     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1340 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1341   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1342   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1343  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1344   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1345   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1346  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1347 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1348   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1349   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1350  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1351   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1352   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1353  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1354 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1355   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1356   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1357  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1358   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1359   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1360  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1361 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1362   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1363   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1364  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1365   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1366   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1367  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1368 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1369   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1370   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1371  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1372   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1373   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1374  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1375 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1376   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1377   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1378  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1379   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1380   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1381  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1382 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1383   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1384   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1385  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1386   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1387   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1388  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1390   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1391   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1392  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1393   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1394   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1395  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1397   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1398   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1399  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1400   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1401   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1402  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1404   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1405   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1406  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1407   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1408   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1409  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1411   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1412   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1413  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1414   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1415   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1416  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1418   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1419   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1420  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1421   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1422   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1423  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1425   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1426   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1427  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1428   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1429   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1430  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1432   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1433   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1434  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1435   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1436   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1437  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1439   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1440   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1441  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1442   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1443   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1444  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1445 };
1446
1447 /*
1448  * select_point selects the |idx|th point from a precomputation table and
1449  * copies it to out.
1450  */
1451  /* pre_comp below is of the size provided in |size| */
1452 static void select_point(const limb idx, unsigned int size,
1453                          const felem pre_comp[][3], felem out[3])
1454 {
1455     unsigned i, j;
1456     limb *outlimbs = &out[0][0];
1457     memset(outlimbs, 0, 3 * sizeof(felem));
1458
1459     for (i = 0; i < size; i++) {
1460         const limb *inlimbs = &pre_comp[i][0][0];
1461         limb mask = i ^ idx;
1462         mask |= mask >> 4;
1463         mask |= mask >> 2;
1464         mask |= mask >> 1;
1465         mask &= 1;
1466         mask--;
1467         for (j = 0; j < NLIMBS * 3; j++)
1468             outlimbs[j] |= inlimbs[j] & mask;
1469     }
1470 }
1471
1472 /* get_bit returns the |i|th bit in |in| */
1473 static char get_bit(const felem_bytearray in, int i)
1474 {
1475     if (i < 0)
1476         return 0;
1477     return (in[i >> 3] >> (i & 7)) & 1;
1478 }
1479
1480 /*
1481  * Interleaved point multiplication using precomputed point multiples: The
1482  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1483  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1484  * generator, using certain (large) precomputed multiples in g_pre_comp.
1485  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1486  */
1487 static void batch_mul(felem x_out, felem y_out, felem z_out,
1488                       const felem_bytearray scalars[],
1489                       const unsigned num_points, const u8 *g_scalar,
1490                       const int mixed, const felem pre_comp[][17][3],
1491                       const felem g_pre_comp[16][3])
1492 {
1493     int i, skip;
1494     unsigned num, gen_mul = (g_scalar != NULL);
1495     felem nq[3], tmp[4];
1496     limb bits;
1497     u8 sign, digit;
1498
1499     /* set nq to the point at infinity */
1500     memset(nq, 0, 3 * sizeof(felem));
1501
1502     /*
1503      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1504      * of the generator (last quarter of rounds) and additions of other
1505      * points multiples (every 5th round).
1506      */
1507     skip = 1;                   /* save two point operations in the first
1508                                  * round */
1509     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510         /* double */
1511         if (!skip)
1512             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1513
1514         /* add multiples of the generator */
1515         if (gen_mul && (i <= 130)) {
1516             bits = get_bit(g_scalar, i + 390) << 3;
1517             if (i < 130) {
1518                 bits |= get_bit(g_scalar, i + 260) << 2;
1519                 bits |= get_bit(g_scalar, i + 130) << 1;
1520                 bits |= get_bit(g_scalar, i);
1521             }
1522             /* select the point to add, in constant time */
1523             select_point(bits, 16, g_pre_comp, tmp);
1524             if (!skip) {
1525                 /* The 1 argument below is for "mixed" */
1526                 point_add(nq[0], nq[1], nq[2],
1527                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1528             } else {
1529                 memcpy(nq, tmp, 3 * sizeof(felem));
1530                 skip = 0;
1531             }
1532         }
1533
1534         /* do other additions every 5 doublings */
1535         if (num_points && (i % 5 == 0)) {
1536             /* loop over all scalars */
1537             for (num = 0; num < num_points; ++num) {
1538                 bits = get_bit(scalars[num], i + 4) << 5;
1539                 bits |= get_bit(scalars[num], i + 3) << 4;
1540                 bits |= get_bit(scalars[num], i + 2) << 3;
1541                 bits |= get_bit(scalars[num], i + 1) << 2;
1542                 bits |= get_bit(scalars[num], i) << 1;
1543                 bits |= get_bit(scalars[num], i - 1);
1544                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1545
1546                 /*
1547                  * select the point to add or subtract, in constant time
1548                  */
1549                 select_point(digit, 17, pre_comp[num], tmp);
1550                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1551                                             * point */
1552                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1553
1554                 if (!skip) {
1555                     point_add(nq[0], nq[1], nq[2],
1556                               nq[0], nq[1], nq[2],
1557                               mixed, tmp[0], tmp[1], tmp[2]);
1558                 } else {
1559                     memcpy(nq, tmp, 3 * sizeof(felem));
1560                     skip = 0;
1561                 }
1562             }
1563         }
1564     }
1565     felem_assign(x_out, nq[0]);
1566     felem_assign(y_out, nq[1]);
1567     felem_assign(z_out, nq[2]);
1568 }
1569
1570 /* Precomputation for the group generator. */
1571 typedef struct {
1572     felem g_pre_comp[16][3];
1573     int references;
1574 } NISTP521_PRE_COMP;
1575
1576 const EC_METHOD *EC_GFp_nistp521_method(void)
1577 {
1578     static const EC_METHOD ret = {
1579         EC_FLAGS_DEFAULT_OCT,
1580         NID_X9_62_prime_field,
1581         ec_GFp_nistp521_group_init,
1582         ec_GFp_simple_group_finish,
1583         ec_GFp_simple_group_clear_finish,
1584         ec_GFp_nist_group_copy,
1585         ec_GFp_nistp521_group_set_curve,
1586         ec_GFp_simple_group_get_curve,
1587         ec_GFp_simple_group_get_degree,
1588         ec_GFp_simple_group_check_discriminant,
1589         ec_GFp_simple_point_init,
1590         ec_GFp_simple_point_finish,
1591         ec_GFp_simple_point_clear_finish,
1592         ec_GFp_simple_point_copy,
1593         ec_GFp_simple_point_set_to_infinity,
1594         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1595         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1596         ec_GFp_simple_point_set_affine_coordinates,
1597         ec_GFp_nistp521_point_get_affine_coordinates,
1598         0 /* point_set_compressed_coordinates */ ,
1599         0 /* point2oct */ ,
1600         0 /* oct2point */ ,
1601         ec_GFp_simple_add,
1602         ec_GFp_simple_dbl,
1603         ec_GFp_simple_invert,
1604         ec_GFp_simple_is_at_infinity,
1605         ec_GFp_simple_is_on_curve,
1606         ec_GFp_simple_cmp,
1607         ec_GFp_simple_make_affine,
1608         ec_GFp_simple_points_make_affine,
1609         ec_GFp_nistp521_points_mul,
1610         ec_GFp_nistp521_precompute_mult,
1611         ec_GFp_nistp521_have_precompute_mult,
1612         ec_GFp_nist_field_mul,
1613         ec_GFp_nist_field_sqr,
1614         0 /* field_div */ ,
1615         0 /* field_encode */ ,
1616         0 /* field_decode */ ,
1617         0                       /* field_set_to_one */
1618     };
1619
1620     return &ret;
1621 }
1622
1623 /******************************************************************************/
1624 /*
1625  * FUNCTIONS TO MANAGE PRECOMPUTATION
1626  */
1627
1628 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1629 {
1630     NISTP521_PRE_COMP *ret = NULL;
1631     ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1632     if (!ret) {
1633         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1634         return ret;
1635     }
1636     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1637     ret->references = 1;
1638     return ret;
1639 }
1640
1641 static void *nistp521_pre_comp_dup(void *src_)
1642 {
1643     NISTP521_PRE_COMP *src = src_;
1644
1645     /* no need to actually copy, these objects never change! */
1646     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1647
1648     return src_;
1649 }
1650
1651 static void nistp521_pre_comp_free(void *pre_)
1652 {
1653     int i;
1654     NISTP521_PRE_COMP *pre = pre_;
1655
1656     if (!pre)
1657         return;
1658
1659     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1660     if (i > 0)
1661         return;
1662
1663     OPENSSL_free(pre);
1664 }
1665
1666 static void nistp521_pre_comp_clear_free(void *pre_)
1667 {
1668     int i;
1669     NISTP521_PRE_COMP *pre = pre_;
1670
1671     if (!pre)
1672         return;
1673
1674     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1675     if (i > 0)
1676         return;
1677
1678     OPENSSL_cleanse(pre, sizeof(*pre));
1679     OPENSSL_free(pre);
1680 }
1681
1682 /******************************************************************************/
1683 /*
1684  * OPENSSL EC_METHOD FUNCTIONS
1685  */
1686
1687 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1688 {
1689     int ret;
1690     ret = ec_GFp_simple_group_init(group);
1691     group->a_is_minus3 = 1;
1692     return ret;
1693 }
1694
1695 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1696                                     const BIGNUM *a, const BIGNUM *b,
1697                                     BN_CTX *ctx)
1698 {
1699     int ret = 0;
1700     BN_CTX *new_ctx = NULL;
1701     BIGNUM *curve_p, *curve_a, *curve_b;
1702
1703     if (ctx == NULL)
1704         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1705             return 0;
1706     BN_CTX_start(ctx);
1707     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1708         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1709         ((curve_b = BN_CTX_get(ctx)) == NULL))
1710         goto err;
1711     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1712     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1713     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1714     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1715         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1716               EC_R_WRONG_CURVE_PARAMETERS);
1717         goto err;
1718     }
1719     group->field_mod_func = BN_nist_mod_521;
1720     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1721  err:
1722     BN_CTX_end(ctx);
1723     if (new_ctx != NULL)
1724         BN_CTX_free(new_ctx);
1725     return ret;
1726 }
1727
1728 /*
1729  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1730  * (X/Z^2, Y/Z^3)
1731  */
1732 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1733                                                  const EC_POINT *point,
1734                                                  BIGNUM *x, BIGNUM *y,
1735                                                  BN_CTX *ctx)
1736 {
1737     felem z1, z2, x_in, y_in, x_out, y_out;
1738     largefelem tmp;
1739
1740     if (EC_POINT_is_at_infinity(group, point)) {
1741         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1742               EC_R_POINT_AT_INFINITY);
1743         return 0;
1744     }
1745     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1746         (!BN_to_felem(z1, &point->Z)))
1747         return 0;
1748     felem_inv(z2, z1);
1749     felem_square(tmp, z2);
1750     felem_reduce(z1, tmp);
1751     felem_mul(tmp, x_in, z1);
1752     felem_reduce(x_in, tmp);
1753     felem_contract(x_out, x_in);
1754     if (x != NULL) {
1755         if (!felem_to_BN(x, x_out)) {
1756             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1757                   ERR_R_BN_LIB);
1758             return 0;
1759         }
1760     }
1761     felem_mul(tmp, z1, z2);
1762     felem_reduce(z1, tmp);
1763     felem_mul(tmp, y_in, z1);
1764     felem_reduce(y_in, tmp);
1765     felem_contract(y_out, y_in);
1766     if (y != NULL) {
1767         if (!felem_to_BN(y, y_out)) {
1768             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1769                   ERR_R_BN_LIB);
1770             return 0;
1771         }
1772     }
1773     return 1;
1774 }
1775
1776 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1777 static void make_points_affine(size_t num, felem points[][3],
1778                                felem tmp_felems[])
1779 {
1780     /*
1781      * Runs in constant time, unless an input is the point at infinity (which
1782      * normally shouldn't happen).
1783      */
1784     ec_GFp_nistp_points_make_affine_internal(num,
1785                                              points,
1786                                              sizeof(felem),
1787                                              tmp_felems,
1788                                              (void (*)(void *))felem_one,
1789                                              felem_is_zero_int,
1790                                              (void (*)(void *, const void *))
1791                                              felem_assign,
1792                                              (void (*)(void *, const void *))
1793                                              felem_square_reduce, (void (*)
1794                                                                    (void *,
1795                                                                     const void
1796                                                                     *,
1797                                                                     const void
1798                                                                     *))
1799                                              felem_mul_reduce,
1800                                              (void (*)(void *, const void *))
1801                                              felem_inv,
1802                                              (void (*)(void *, const void *))
1803                                              felem_contract);
1804 }
1805
1806 /*
1807  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1808  * values Result is stored in r (r can equal one of the inputs).
1809  */
1810 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1811                                const BIGNUM *scalar, size_t num,
1812                                const EC_POINT *points[],
1813                                const BIGNUM *scalars[], BN_CTX *ctx)
1814 {
1815     int ret = 0;
1816     int j;
1817     int mixed = 0;
1818     BN_CTX *new_ctx = NULL;
1819     BIGNUM *x, *y, *z, *tmp_scalar;
1820     felem_bytearray g_secret;
1821     felem_bytearray *secrets = NULL;
1822     felem(*pre_comp)[17][3] = NULL;
1823     felem *tmp_felems = NULL;
1824     felem_bytearray tmp;
1825     unsigned i, num_bytes;
1826     int have_pre_comp = 0;
1827     size_t num_points = num;
1828     felem x_in, y_in, z_in, x_out, y_out, z_out;
1829     NISTP521_PRE_COMP *pre = NULL;
1830     felem(*g_pre_comp)[3] = NULL;
1831     EC_POINT *generator = NULL;
1832     const EC_POINT *p = NULL;
1833     const BIGNUM *p_scalar = NULL;
1834
1835     if (ctx == NULL)
1836         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1837             return 0;
1838     BN_CTX_start(ctx);
1839     if (((x = BN_CTX_get(ctx)) == NULL) ||
1840         ((y = BN_CTX_get(ctx)) == NULL) ||
1841         ((z = BN_CTX_get(ctx)) == NULL) ||
1842         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1843         goto err;
1844
1845     if (scalar != NULL) {
1846         pre = EC_EX_DATA_get_data(group->extra_data,
1847                                   nistp521_pre_comp_dup,
1848                                   nistp521_pre_comp_free,
1849                                   nistp521_pre_comp_clear_free);
1850         if (pre)
1851             /* we have precomputation, try to use it */
1852             g_pre_comp = &pre->g_pre_comp[0];
1853         else
1854             /* try to use the standard precomputation */
1855             g_pre_comp = (felem(*)[3]) gmul;
1856         generator = EC_POINT_new(group);
1857         if (generator == NULL)
1858             goto err;
1859         /* get the generator from precomputation */
1860         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1861             !felem_to_BN(y, g_pre_comp[1][1]) ||
1862             !felem_to_BN(z, g_pre_comp[1][2])) {
1863             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1864             goto err;
1865         }
1866         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1867                                                       generator, x, y, z,
1868                                                       ctx))
1869             goto err;
1870         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1871             /* precomputation matches generator */
1872             have_pre_comp = 1;
1873         else
1874             /*
1875              * we don't have valid precomputation: treat the generator as a
1876              * random point
1877              */
1878             num_points++;
1879     }
1880
1881     if (num_points > 0) {
1882         if (num_points >= 2) {
1883             /*
1884              * unless we precompute multiples for just one point, converting
1885              * those into affine form is time well spent
1886              */
1887             mixed = 1;
1888         }
1889         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1890         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1891         if (mixed)
1892             tmp_felems =
1893                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1894         if ((secrets == NULL) || (pre_comp == NULL)
1895             || (mixed && (tmp_felems == NULL))) {
1896             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1897             goto err;
1898         }
1899
1900         /*
1901          * we treat NULL scalars as 0, and NULL points as points at infinity,
1902          * i.e., they contribute nothing to the linear combination
1903          */
1904         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1905         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1906         for (i = 0; i < num_points; ++i) {
1907             if (i == num)
1908                 /*
1909                  * we didn't have a valid precomputation, so we pick the
1910                  * generator
1911                  */
1912             {
1913                 p = EC_GROUP_get0_generator(group);
1914                 p_scalar = scalar;
1915             } else
1916                 /* the i^th point */
1917             {
1918                 p = points[i];
1919                 p_scalar = scalars[i];
1920             }
1921             if ((p_scalar != NULL) && (p != NULL)) {
1922                 /* reduce scalar to 0 <= scalar < 2^521 */
1923                 if ((BN_num_bits(p_scalar) > 521)
1924                     || (BN_is_negative(p_scalar))) {
1925                     /*
1926                      * this is an unusual input, and we don't guarantee
1927                      * constant-timeness
1928                      */
1929                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1930                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931                         goto err;
1932                     }
1933                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1934                 } else
1935                     num_bytes = BN_bn2bin(p_scalar, tmp);
1936                 flip_endian(secrets[i], tmp, num_bytes);
1937                 /* precompute multiples */
1938                 if ((!BN_to_felem(x_out, &p->X)) ||
1939                     (!BN_to_felem(y_out, &p->Y)) ||
1940                     (!BN_to_felem(z_out, &p->Z)))
1941                     goto err;
1942                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1943                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1944                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1945                 for (j = 2; j <= 16; ++j) {
1946                     if (j & 1) {
1947                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1948                                   pre_comp[i][j][2], pre_comp[i][1][0],
1949                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1950                                   pre_comp[i][j - 1][0],
1951                                   pre_comp[i][j - 1][1],
1952                                   pre_comp[i][j - 1][2]);
1953                     } else {
1954                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1955                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1956                                      pre_comp[i][j / 2][1],
1957                                      pre_comp[i][j / 2][2]);
1958                     }
1959                 }
1960             }
1961         }
1962         if (mixed)
1963             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1964     }
1965
1966     /* the scalar for the generator */
1967     if ((scalar != NULL) && (have_pre_comp)) {
1968         memset(g_secret, 0, sizeof(g_secret));
1969         /* reduce scalar to 0 <= scalar < 2^521 */
1970         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1971             /*
1972              * this is an unusual input, and we don't guarantee
1973              * constant-timeness
1974              */
1975             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1976                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1977                 goto err;
1978             }
1979             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1980         } else
1981             num_bytes = BN_bn2bin(scalar, tmp);
1982         flip_endian(g_secret, tmp, num_bytes);
1983         /* do the multiplication with generator precomputation */
1984         batch_mul(x_out, y_out, z_out,
1985                   (const felem_bytearray(*))secrets, num_points,
1986                   g_secret,
1987                   mixed, (const felem(*)[17][3])pre_comp,
1988                   (const felem(*)[3])g_pre_comp);
1989     } else
1990         /* do the multiplication without generator precomputation */
1991         batch_mul(x_out, y_out, z_out,
1992                   (const felem_bytearray(*))secrets, num_points,
1993                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1994     /* reduce the output to its unique minimal representation */
1995     felem_contract(x_in, x_out);
1996     felem_contract(y_in, y_out);
1997     felem_contract(z_in, z_out);
1998     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1999         (!felem_to_BN(z, z_in))) {
2000         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2001         goto err;
2002     }
2003     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2004
2005  err:
2006     BN_CTX_end(ctx);
2007     if (generator != NULL)
2008         EC_POINT_free(generator);
2009     if (new_ctx != NULL)
2010         BN_CTX_free(new_ctx);
2011     if (secrets != NULL)
2012         OPENSSL_free(secrets);
2013     if (pre_comp != NULL)
2014         OPENSSL_free(pre_comp);
2015     if (tmp_felems != NULL)
2016         OPENSSL_free(tmp_felems);
2017     return ret;
2018 }
2019
2020 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2021 {
2022     int ret = 0;
2023     NISTP521_PRE_COMP *pre = NULL;
2024     int i, j;
2025     BN_CTX *new_ctx = NULL;
2026     BIGNUM *x, *y;
2027     EC_POINT *generator = NULL;
2028     felem tmp_felems[16];
2029
2030     /* throw away old precomputation */
2031     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2032                          nistp521_pre_comp_free,
2033                          nistp521_pre_comp_clear_free);
2034     if (ctx == NULL)
2035         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2036             return 0;
2037     BN_CTX_start(ctx);
2038     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2039         goto err;
2040     /* get the generator */
2041     if (group->generator == NULL)
2042         goto err;
2043     generator = EC_POINT_new(group);
2044     if (generator == NULL)
2045         goto err;
2046     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2047     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2048     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2049         goto err;
2050     if ((pre = nistp521_pre_comp_new()) == NULL)
2051         goto err;
2052     /*
2053      * if the generator is the standard one, use built-in precomputation
2054      */
2055     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2056         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2057         goto done;
2058     }
2059     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2060         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2061         (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2062         goto err;
2063     /* compute 2^130*G, 2^260*G, 2^390*G */
2064     for (i = 1; i <= 4; i <<= 1) {
2065         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2066                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2067                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2068         for (j = 0; j < 129; ++j) {
2069             point_double(pre->g_pre_comp[2 * i][0],
2070                          pre->g_pre_comp[2 * i][1],
2071                          pre->g_pre_comp[2 * i][2],
2072                          pre->g_pre_comp[2 * i][0],
2073                          pre->g_pre_comp[2 * i][1],
2074                          pre->g_pre_comp[2 * i][2]);
2075         }
2076     }
2077     /* g_pre_comp[0] is the point at infinity */
2078     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2079     /* the remaining multiples */
2080     /* 2^130*G + 2^260*G */
2081     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2082               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2083               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2084               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2085               pre->g_pre_comp[2][2]);
2086     /* 2^130*G + 2^390*G */
2087     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2088               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2089               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2090               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2091               pre->g_pre_comp[2][2]);
2092     /* 2^260*G + 2^390*G */
2093     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2094               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2095               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2096               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2097               pre->g_pre_comp[4][2]);
2098     /* 2^130*G + 2^260*G + 2^390*G */
2099     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2100               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2101               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2102               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2103               pre->g_pre_comp[2][2]);
2104     for (i = 1; i < 8; ++i) {
2105         /* odd multiples: add G */
2106         point_add(pre->g_pre_comp[2 * i + 1][0],
2107                   pre->g_pre_comp[2 * i + 1][1],
2108                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2109                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2110                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2111                   pre->g_pre_comp[1][2]);
2112     }
2113     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2114
2115  done:
2116     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2117                              nistp521_pre_comp_free,
2118                              nistp521_pre_comp_clear_free))
2119         goto err;
2120     ret = 1;
2121     pre = NULL;
2122  err:
2123     BN_CTX_end(ctx);
2124     if (generator != NULL)
2125         EC_POINT_free(generator);
2126     if (new_ctx != NULL)
2127         BN_CTX_free(new_ctx);
2128     if (pre)
2129         nistp521_pre_comp_free(pre);
2130     return ret;
2131 }
2132
2133 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2134 {
2135     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2136                             nistp521_pre_comp_free,
2137                             nistp521_pre_comp_clear_free)
2138         != NULL)
2139         return 1;
2140     else
2141         return 0;
2142 }
2143
2144 #else
2145 static void *dummy = &dummy;
2146 #endif