crypto/openssl/crypto/ec/ecp_nistp224.c

   1 /*
   2  * Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
  28  *
  29  * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
  30  * and Adam Langley's public domain 64-bit C implementation of curve25519
  31  */
  32
  33 #include <openssl/opensslconf.h>
  34 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  35 NON_EMPTY_TRANSLATION_UNIT
  36 #else
  37
  38 # include <stdint.h>
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_local.h"
  42
  43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  44   /* even with gcc, the typedef won't work for 32-bit platforms */
  45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  46                                  * platforms */
  47 # else
  48 #  error "Your compiler doesn't appear to support 128-bit integer types"
  49 # endif
  50
  51 typedef uint8_t u8;
  52 typedef uint64_t u64;
  53
  54 /******************************************************************************/
  55 /*-
  56  * INTERNAL REPRESENTATION OF FIELD ELEMENTS
  57  *
  58  * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
  59  * using 64-bit coefficients called 'limbs',
  60  * and sometimes (for multiplication results) as
  61  * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
  62  * using 128-bit coefficients called 'widelimbs'.
  63  * A 4-limb representation is an 'felem';
  64  * a 7-widelimb representation is a 'widefelem'.
  65  * Even within felems, bits of adjacent limbs overlap, and we don't always
  66  * reduce the representations: we ensure that inputs to each felem
  67  * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
  68  * and fit into a 128-bit word without overflow. The coefficients are then
  69  * again partially reduced to obtain an felem satisfying a_i < 2^57.
  70  * We only reduce to the unique minimal representation at the end of the
  71  * computation.
  72  */
  73
  74 typedef uint64_t limb;
  75 typedef uint64_t limb_aX __attribute((__aligned__(1)));
  76 typedef uint128_t widelimb;
  77
  78 typedef limb felem[4];
  79 typedef widelimb widefelem[7];
  80
  81 /*
  82  * Field element represented as a byte array. 28*8 = 224 bits is also the
  83  * group order size for the elliptic curve, and we also use this type for
  84  * scalars for point multiplication.
  85  */
  86 typedef u8 felem_bytearray[28];
  87
  88 static const felem_bytearray nistp224_curve_params[5] = {
  89     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
  90      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
  91      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
  92     {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a */
  93      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
  94      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE},
  95     {0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, /* b */
  96      0x32, 0x56, 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA,
  97      0x27, 0x0B, 0x39, 0x43, 0x23, 0x55, 0xFF, 0xB4},
  98     {0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, /* x */
  99      0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22,
 100      0x34, 0x32, 0x80, 0xD6, 0x11, 0x5C, 0x1D, 0x21},
 101     {0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, /* y */
 102      0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64,
 103      0x44, 0xd5, 0x81, 0x99, 0x85, 0x00, 0x7e, 0x34}
 104 };
 105
 106 /*-
 107  * Precomputed multiples of the standard generator
 108  * Points are given in coordinates (X, Y, Z) where Z normally is 1
 109  * (0 for the point at infinity).
 110  * For each field element, slice a_0 is word 0, etc.
 111  *
 112  * The table has 2 * 16 elements, starting with the following:
 113  * index | bits    | point
 114  * ------+---------+------------------------------
 115  *     0 | 0 0 0 0 | 0G
 116  *     1 | 0 0 0 1 | 1G
 117  *     2 | 0 0 1 0 | 2^56G
 118  *     3 | 0 0 1 1 | (2^56 + 1)G
 119  *     4 | 0 1 0 0 | 2^112G
 120  *     5 | 0 1 0 1 | (2^112 + 1)G
 121  *     6 | 0 1 1 0 | (2^112 + 2^56)G
 122  *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
 123  *     8 | 1 0 0 0 | 2^168G
 124  *     9 | 1 0 0 1 | (2^168 + 1)G
 125  *    10 | 1 0 1 0 | (2^168 + 2^56)G
 126  *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
 127  *    12 | 1 1 0 0 | (2^168 + 2^112)G
 128  *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
 129  *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
 130  *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
 131  * followed by a copy of this with each element multiplied by 2^28.
 132  *
 133  * The reason for this is so that we can clock bits into four different
 134  * locations when doing simple scalar multiplies against the base point,
 135  * and then another four locations using the second 16 elements.
 136  */
 137 static const felem gmul[2][16][3] = {
 138 {{{0, 0, 0, 0},
 139   {0, 0, 0, 0},
 140   {0, 0, 0, 0}},
 141  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
 142   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
 143   {1, 0, 0, 0}},
 144  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
 145   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
 146   {1, 0, 0, 0}},
 147  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
 148   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
 149   {1, 0, 0, 0}},
 150  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
 151   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
 152   {1, 0, 0, 0}},
 153  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
 154   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
 155   {1, 0, 0, 0}},
 156  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
 157   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
 158   {1, 0, 0, 0}},
 159  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
 160   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
 161   {1, 0, 0, 0}},
 162  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
 163   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
 164   {1, 0, 0, 0}},
 165  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
 166   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
 167   {1, 0, 0, 0}},
 168  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
 169   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
 170   {1, 0, 0, 0}},
 171  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
 172   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
 173   {1, 0, 0, 0}},
 174  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
 175   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
 176   {1, 0, 0, 0}},
 177  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
 178   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
 179   {1, 0, 0, 0}},
 180  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
 181   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
 182   {1, 0, 0, 0}},
 183  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
 184   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
 185   {1, 0, 0, 0}}},
 186 {{{0, 0, 0, 0},
 187   {0, 0, 0, 0},
 188   {0, 0, 0, 0}},
 189  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
 190   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
 191   {1, 0, 0, 0}},
 192  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
 193   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
 194   {1, 0, 0, 0}},
 195  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
 196   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
 197   {1, 0, 0, 0}},
 198  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
 199   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
 200   {1, 0, 0, 0}},
 201  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
 202   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
 203   {1, 0, 0, 0}},
 204  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
 205   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
 206   {1, 0, 0, 0}},
 207  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
 208   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
 209   {1, 0, 0, 0}},
 210  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
 211   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
 212   {1, 0, 0, 0}},
 213  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
 214   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
 215   {1, 0, 0, 0}},
 216  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
 217   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
 218   {1, 0, 0, 0}},
 219  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
 220   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
 221   {1, 0, 0, 0}},
 222  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
 223   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
 224   {1, 0, 0, 0}},
 225  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
 226   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
 227   {1, 0, 0, 0}},
 228  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
 229   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
 230   {1, 0, 0, 0}},
 231  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
 232   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
 233   {1, 0, 0, 0}}}
 234 };
 235
 236 /* Precomputation for the group generator. */
 237 struct nistp224_pre_comp_st {
 238     felem g_pre_comp[2][16][3];
 239     CRYPTO_REF_COUNT references;
 240     CRYPTO_RWLOCK *lock;
 241 };
 242
 243 const EC_METHOD *EC_GFp_nistp224_method(void)
 244 {
 245     static const EC_METHOD ret = {
 246         EC_FLAGS_DEFAULT_OCT,
 247         NID_X9_62_prime_field,
 248         ec_GFp_nistp224_group_init,
 249         ec_GFp_simple_group_finish,
 250         ec_GFp_simple_group_clear_finish,
 251         ec_GFp_nist_group_copy,
 252         ec_GFp_nistp224_group_set_curve,
 253         ec_GFp_simple_group_get_curve,
 254         ec_GFp_simple_group_get_degree,
 255         ec_group_simple_order_bits,
 256         ec_GFp_simple_group_check_discriminant,
 257         ec_GFp_simple_point_init,
 258         ec_GFp_simple_point_finish,
 259         ec_GFp_simple_point_clear_finish,
 260         ec_GFp_simple_point_copy,
 261         ec_GFp_simple_point_set_to_infinity,
 262         ec_GFp_simple_set_Jprojective_coordinates_GFp,
 263         ec_GFp_simple_get_Jprojective_coordinates_GFp,
 264         ec_GFp_simple_point_set_affine_coordinates,
 265         ec_GFp_nistp224_point_get_affine_coordinates,
 266         0 /* point_set_compressed_coordinates */ ,
 267         0 /* point2oct */ ,
 268         0 /* oct2point */ ,
 269         ec_GFp_simple_add,
 270         ec_GFp_simple_dbl,
 271         ec_GFp_simple_invert,
 272         ec_GFp_simple_is_at_infinity,
 273         ec_GFp_simple_is_on_curve,
 274         ec_GFp_simple_cmp,
 275         ec_GFp_simple_make_affine,
 276         ec_GFp_simple_points_make_affine,
 277         ec_GFp_nistp224_points_mul,
 278         ec_GFp_nistp224_precompute_mult,
 279         ec_GFp_nistp224_have_precompute_mult,
 280         ec_GFp_nist_field_mul,
 281         ec_GFp_nist_field_sqr,
 282         0 /* field_div */ ,
 283         ec_GFp_simple_field_inv,
 284         0 /* field_encode */ ,
 285         0 /* field_decode */ ,
 286         0,                      /* field_set_to_one */
 287         ec_key_simple_priv2oct,
 288         ec_key_simple_oct2priv,
 289         0, /* set private */
 290         ec_key_simple_generate_key,
 291         ec_key_simple_check_key,
 292         ec_key_simple_generate_public_key,
 293         0, /* keycopy */
 294         0, /* keyfinish */
 295         ecdh_simple_compute_key,
 296         0, /* field_inverse_mod_ord */
 297         0, /* blind_coordinates */
 298         0, /* ladder_pre */
 299         0, /* ladder_step */
 300         0  /* ladder_post */
 301     };
 302
 303     return &ret;
 304 }
 305
 306 /*
 307  * Helper functions to convert field elements to/from internal representation
 308  */
 309 static void bin28_to_felem(felem out, const u8 in[28])
 310 {
 311     out[0] = *((const limb *)(in)) & 0x00ffffffffffffff;
 312     out[1] = (*((const limb_aX *)(in + 7))) & 0x00ffffffffffffff;
 313     out[2] = (*((const limb_aX *)(in + 14))) & 0x00ffffffffffffff;
 314     out[3] = (*((const limb_aX *)(in + 20))) >> 8;
 315 }
 316
 317 static void felem_to_bin28(u8 out[28], const felem in)
 318 {
 319     unsigned i;
 320     for (i = 0; i < 7; ++i) {
 321         out[i] = in[0] >> (8 * i);
 322         out[i + 7] = in[1] >> (8 * i);
 323         out[i + 14] = in[2] >> (8 * i);
 324         out[i + 21] = in[3] >> (8 * i);
 325     }
 326 }
 327
 328 /* From OpenSSL BIGNUM to internal representation */
 329 static int BN_to_felem(felem out, const BIGNUM *bn)
 330 {
 331     felem_bytearray b_out;
 332     int num_bytes;
 333
 334     if (BN_is_negative(bn)) {
 335         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 336         return 0;
 337     }
 338     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 339     if (num_bytes < 0) {
 340         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 341         return 0;
 342     }
 343     bin28_to_felem(out, b_out);
 344     return 1;
 345 }
 346
 347 /* From internal representation to OpenSSL BIGNUM */
 348 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 349 {
 350     felem_bytearray b_out;
 351     felem_to_bin28(b_out, in);
 352     return BN_lebin2bn(b_out, sizeof(b_out), out);
 353 }
 354
 355 /******************************************************************************/
 356 /*-
 357  *                              FIELD OPERATIONS
 358  *
 359  * Field operations, using the internal representation of field elements.
 360  * NB! These operations are specific to our point multiplication and cannot be
 361  * expected to be correct in general - e.g., multiplication with a large scalar
 362  * will cause an overflow.
 363  *
 364  */
 365
 366 static void felem_one(felem out)
 367 {
 368     out[0] = 1;
 369     out[1] = 0;
 370     out[2] = 0;
 371     out[3] = 0;
 372 }
 373
 374 static void felem_assign(felem out, const felem in)
 375 {
 376     out[0] = in[0];
 377     out[1] = in[1];
 378     out[2] = in[2];
 379     out[3] = in[3];
 380 }
 381
 382 /* Sum two field elements: out += in */
 383 static void felem_sum(felem out, const felem in)
 384 {
 385     out[0] += in[0];
 386     out[1] += in[1];
 387     out[2] += in[2];
 388     out[3] += in[3];
 389 }
 390
 391 /* Subtract field elements: out -= in */
 392 /* Assumes in[i] < 2^57 */
 393 static void felem_diff(felem out, const felem in)
 394 {
 395     static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
 396     static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
 397     static const limb two58m42m2 = (((limb) 1) << 58) -
 398         (((limb) 1) << 42) - (((limb) 1) << 2);
 399
 400     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 401     out[0] += two58p2;
 402     out[1] += two58m42m2;
 403     out[2] += two58m2;
 404     out[3] += two58m2;
 405
 406     out[0] -= in[0];
 407     out[1] -= in[1];
 408     out[2] -= in[2];
 409     out[3] -= in[3];
 410 }
 411
 412 /* Subtract in unreduced 128-bit mode: out -= in */
 413 /* Assumes in[i] < 2^119 */
 414 static void widefelem_diff(widefelem out, const widefelem in)
 415 {
 416     static const widelimb two120 = ((widelimb) 1) << 120;
 417     static const widelimb two120m64 = (((widelimb) 1) << 120) -
 418         (((widelimb) 1) << 64);
 419     static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
 420         (((widelimb) 1) << 104) - (((widelimb) 1) << 64);
 421
 422     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 423     out[0] += two120;
 424     out[1] += two120m64;
 425     out[2] += two120m64;
 426     out[3] += two120;
 427     out[4] += two120m104m64;
 428     out[5] += two120m64;
 429     out[6] += two120m64;
 430
 431     out[0] -= in[0];
 432     out[1] -= in[1];
 433     out[2] -= in[2];
 434     out[3] -= in[3];
 435     out[4] -= in[4];
 436     out[5] -= in[5];
 437     out[6] -= in[6];
 438 }
 439
 440 /* Subtract in mixed mode: out128 -= in64 */
 441 /* in[i] < 2^63 */
 442 static void felem_diff_128_64(widefelem out, const felem in)
 443 {
 444     static const widelimb two64p8 = (((widelimb) 1) << 64) +
 445         (((widelimb) 1) << 8);
 446     static const widelimb two64m8 = (((widelimb) 1) << 64) -
 447         (((widelimb) 1) << 8);
 448     static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
 449         (((widelimb) 1) << 48) - (((widelimb) 1) << 8);
 450
 451     /* Add 0 mod 2^224-2^96+1 to ensure out > in */
 452     out[0] += two64p8;
 453     out[1] += two64m48m8;
 454     out[2] += two64m8;
 455     out[3] += two64m8;
 456
 457     out[0] -= in[0];
 458     out[1] -= in[1];
 459     out[2] -= in[2];
 460     out[3] -= in[3];
 461 }
 462
 463 /*
 464  * Multiply a field element by a scalar: out = out * scalar The scalars we
 465  * actually use are small, so results fit without overflow
 466  */
 467 static void felem_scalar(felem out, const limb scalar)
 468 {
 469     out[0] *= scalar;
 470     out[1] *= scalar;
 471     out[2] *= scalar;
 472     out[3] *= scalar;
 473 }
 474
 475 /*
 476  * Multiply an unreduced field element by a scalar: out = out * scalar The
 477  * scalars we actually use are small, so results fit without overflow
 478  */
 479 static void widefelem_scalar(widefelem out, const widelimb scalar)
 480 {
 481     out[0] *= scalar;
 482     out[1] *= scalar;
 483     out[2] *= scalar;
 484     out[3] *= scalar;
 485     out[4] *= scalar;
 486     out[5] *= scalar;
 487     out[6] *= scalar;
 488 }
 489
 490 /* Square a field element: out = in^2 */
 491 static void felem_square(widefelem out, const felem in)
 492 {
 493     limb tmp0, tmp1, tmp2;
 494     tmp0 = 2 * in[0];
 495     tmp1 = 2 * in[1];
 496     tmp2 = 2 * in[2];
 497     out[0] = ((widelimb) in[0]) * in[0];
 498     out[1] = ((widelimb) in[0]) * tmp1;
 499     out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
 500     out[3] = ((widelimb) in[3]) * tmp0 + ((widelimb) in[1]) * tmp2;
 501     out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
 502     out[5] = ((widelimb) in[3]) * tmp2;
 503     out[6] = ((widelimb) in[3]) * in[3];
 504 }
 505
 506 /* Multiply two field elements: out = in1 * in2 */
 507 static void felem_mul(widefelem out, const felem in1, const felem in2)
 508 {
 509     out[0] = ((widelimb) in1[0]) * in2[0];
 510     out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
 511     out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
 512              ((widelimb) in1[2]) * in2[0];
 513     out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
 514              ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
 515     out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
 516              ((widelimb) in1[3]) * in2[1];
 517     out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
 518     out[6] = ((widelimb) in1[3]) * in2[3];
 519 }
 520
 521 /*-
 522  * Reduce seven 128-bit coefficients to four 64-bit coefficients.
 523  * Requires in[i] < 2^126,
 524  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
 525 static void felem_reduce(felem out, const widefelem in)
 526 {
 527     static const widelimb two127p15 = (((widelimb) 1) << 127) +
 528         (((widelimb) 1) << 15);
 529     static const widelimb two127m71 = (((widelimb) 1) << 127) -
 530         (((widelimb) 1) << 71);
 531     static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
 532         (((widelimb) 1) << 71) - (((widelimb) 1) << 55);
 533     widelimb output[5];
 534
 535     /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
 536     output[0] = in[0] + two127p15;
 537     output[1] = in[1] + two127m71m55;
 538     output[2] = in[2] + two127m71;
 539     output[3] = in[3];
 540     output[4] = in[4];
 541
 542     /* Eliminate in[4], in[5], in[6] */
 543     output[4] += in[6] >> 16;
 544     output[3] += (in[6] & 0xffff) << 40;
 545     output[2] -= in[6];
 546
 547     output[3] += in[5] >> 16;
 548     output[2] += (in[5] & 0xffff) << 40;
 549     output[1] -= in[5];
 550
 551     output[2] += output[4] >> 16;
 552     output[1] += (output[4] & 0xffff) << 40;
 553     output[0] -= output[4];
 554
 555     /* Carry 2 -> 3 -> 4 */
 556     output[3] += output[2] >> 56;
 557     output[2] &= 0x00ffffffffffffff;
 558
 559     output[4] = output[3] >> 56;
 560     output[3] &= 0x00ffffffffffffff;
 561
 562     /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
 563
 564     /* Eliminate output[4] */
 565     output[2] += output[4] >> 16;
 566     /* output[2] < 2^56 + 2^56 = 2^57 */
 567     output[1] += (output[4] & 0xffff) << 40;
 568     output[0] -= output[4];
 569
 570     /* Carry 0 -> 1 -> 2 -> 3 */
 571     output[1] += output[0] >> 56;
 572     out[0] = output[0] & 0x00ffffffffffffff;
 573
 574     output[2] += output[1] >> 56;
 575     /* output[2] < 2^57 + 2^72 */
 576     out[1] = output[1] & 0x00ffffffffffffff;
 577     output[3] += output[2] >> 56;
 578     /* output[3] <= 2^56 + 2^16 */
 579     out[2] = output[2] & 0x00ffffffffffffff;
 580
 581     /*-
 582      * out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
 583      * out[3] <= 2^56 + 2^16 (due to final carry),
 584      * so out < 2*p
 585      */
 586     out[3] = output[3];
 587 }
 588
 589 static void felem_square_reduce(felem out, const felem in)
 590 {
 591     widefelem tmp;
 592     felem_square(tmp, in);
 593     felem_reduce(out, tmp);
 594 }
 595
 596 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 597 {
 598     widefelem tmp;
 599     felem_mul(tmp, in1, in2);
 600     felem_reduce(out, tmp);
 601 }
 602
 603 /*
 604  * Reduce to unique minimal representation. Requires 0 <= in < 2*p (always
 605  * call felem_reduce first)
 606  */
 607 static void felem_contract(felem out, const felem in)
 608 {
 609     static const int64_t two56 = ((limb) 1) << 56;
 610     /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
 611     /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
 612     int64_t tmp[4], a;
 613     tmp[0] = in[0];
 614     tmp[1] = in[1];
 615     tmp[2] = in[2];
 616     tmp[3] = in[3];
 617     /* Case 1: a = 1 iff in >= 2^224 */
 618     a = (in[3] >> 56);
 619     tmp[0] -= a;
 620     tmp[1] += a << 40;
 621     tmp[3] &= 0x00ffffffffffffff;
 622     /*
 623      * Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1
 624      * and the lower part is non-zero
 625      */
 626     a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
 627         (((int64_t) (in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
 628     a &= 0x00ffffffffffffff;
 629     /* turn a into an all-one mask (if a = 0) or an all-zero mask */
 630     a = (a - 1) >> 63;
 631     /* subtract 2^224 - 2^96 + 1 if a is all-one */
 632     tmp[3] &= a ^ 0xffffffffffffffff;
 633     tmp[2] &= a ^ 0xffffffffffffffff;
 634     tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
 635     tmp[0] -= 1 & a;
 636
 637     /*
 638      * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
 639      * non-zero, so we only need one step
 640      */
 641     a = tmp[0] >> 63;
 642     tmp[0] += two56 & a;
 643     tmp[1] -= 1 & a;
 644
 645     /* carry 1 -> 2 -> 3 */
 646     tmp[2] += tmp[1] >> 56;
 647     tmp[1] &= 0x00ffffffffffffff;
 648
 649     tmp[3] += tmp[2] >> 56;
 650     tmp[2] &= 0x00ffffffffffffff;
 651
 652     /* Now 0 <= out < p */
 653     out[0] = tmp[0];
 654     out[1] = tmp[1];
 655     out[2] = tmp[2];
 656     out[3] = tmp[3];
 657 }
 658
 659 /*
 660  * Get negative value: out = -in
 661  * Requires in[i] < 2^63,
 662  * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
 663  */
 664 static void felem_neg(felem out, const felem in)
 665 {
 666     widefelem tmp = {0};
 667     felem_diff_128_64(tmp, in);
 668     felem_reduce(out, tmp);
 669 }
 670
 671 /*
 672  * Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
 673  * elements are reduced to in < 2^225, so we only need to check three cases:
 674  * 0, 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
 675  */
 676 static limb felem_is_zero(const felem in)
 677 {
 678     limb zero, two224m96p1, two225m97p2;
 679
 680     zero = in[0] | in[1] | in[2] | in[3];
 681     zero = (((int64_t) (zero) - 1) >> 63) & 1;
 682     two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
 683         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
 684     two224m96p1 = (((int64_t) (two224m96p1) - 1) >> 63) & 1;
 685     two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
 686         | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
 687     two225m97p2 = (((int64_t) (two225m97p2) - 1) >> 63) & 1;
 688     return (zero | two224m96p1 | two225m97p2);
 689 }
 690
 691 static int felem_is_zero_int(const void *in)
 692 {
 693     return (int)(felem_is_zero(in) & ((limb) 1));
 694 }
 695
 696 /* Invert a field element */
 697 /* Computation chain copied from djb's code */
 698 static void felem_inv(felem out, const felem in)
 699 {
 700     felem ftmp, ftmp2, ftmp3, ftmp4;
 701     widefelem tmp;
 702     unsigned i;
 703
 704     felem_square(tmp, in);
 705     felem_reduce(ftmp, tmp);    /* 2 */
 706     felem_mul(tmp, in, ftmp);
 707     felem_reduce(ftmp, tmp);    /* 2^2 - 1 */
 708     felem_square(tmp, ftmp);
 709     felem_reduce(ftmp, tmp);    /* 2^3 - 2 */
 710     felem_mul(tmp, in, ftmp);
 711     felem_reduce(ftmp, tmp);    /* 2^3 - 1 */
 712     felem_square(tmp, ftmp);
 713     felem_reduce(ftmp2, tmp);   /* 2^4 - 2 */
 714     felem_square(tmp, ftmp2);
 715     felem_reduce(ftmp2, tmp);   /* 2^5 - 4 */
 716     felem_square(tmp, ftmp2);
 717     felem_reduce(ftmp2, tmp);   /* 2^6 - 8 */
 718     felem_mul(tmp, ftmp2, ftmp);
 719     felem_reduce(ftmp, tmp);    /* 2^6 - 1 */
 720     felem_square(tmp, ftmp);
 721     felem_reduce(ftmp2, tmp);   /* 2^7 - 2 */
 722     for (i = 0; i < 5; ++i) {   /* 2^12 - 2^6 */
 723         felem_square(tmp, ftmp2);
 724         felem_reduce(ftmp2, tmp);
 725     }
 726     felem_mul(tmp, ftmp2, ftmp);
 727     felem_reduce(ftmp2, tmp);   /* 2^12 - 1 */
 728     felem_square(tmp, ftmp2);
 729     felem_reduce(ftmp3, tmp);   /* 2^13 - 2 */
 730     for (i = 0; i < 11; ++i) {  /* 2^24 - 2^12 */
 731         felem_square(tmp, ftmp3);
 732         felem_reduce(ftmp3, tmp);
 733     }
 734     felem_mul(tmp, ftmp3, ftmp2);
 735     felem_reduce(ftmp2, tmp);   /* 2^24 - 1 */
 736     felem_square(tmp, ftmp2);
 737     felem_reduce(ftmp3, tmp);   /* 2^25 - 2 */
 738     for (i = 0; i < 23; ++i) {  /* 2^48 - 2^24 */
 739         felem_square(tmp, ftmp3);
 740         felem_reduce(ftmp3, tmp);
 741     }
 742     felem_mul(tmp, ftmp3, ftmp2);
 743     felem_reduce(ftmp3, tmp);   /* 2^48 - 1 */
 744     felem_square(tmp, ftmp3);
 745     felem_reduce(ftmp4, tmp);   /* 2^49 - 2 */
 746     for (i = 0; i < 47; ++i) {  /* 2^96 - 2^48 */
 747         felem_square(tmp, ftmp4);
 748         felem_reduce(ftmp4, tmp);
 749     }
 750     felem_mul(tmp, ftmp3, ftmp4);
 751     felem_reduce(ftmp3, tmp);   /* 2^96 - 1 */
 752     felem_square(tmp, ftmp3);
 753     felem_reduce(ftmp4, tmp);   /* 2^97 - 2 */
 754     for (i = 0; i < 23; ++i) {  /* 2^120 - 2^24 */
 755         felem_square(tmp, ftmp4);
 756         felem_reduce(ftmp4, tmp);
 757     }
 758     felem_mul(tmp, ftmp2, ftmp4);
 759     felem_reduce(ftmp2, tmp);   /* 2^120 - 1 */
 760     for (i = 0; i < 6; ++i) {   /* 2^126 - 2^6 */
 761         felem_square(tmp, ftmp2);
 762         felem_reduce(ftmp2, tmp);
 763     }
 764     felem_mul(tmp, ftmp2, ftmp);
 765     felem_reduce(ftmp, tmp);    /* 2^126 - 1 */
 766     felem_square(tmp, ftmp);
 767     felem_reduce(ftmp, tmp);    /* 2^127 - 2 */
 768     felem_mul(tmp, ftmp, in);
 769     felem_reduce(ftmp, tmp);    /* 2^127 - 1 */
 770     for (i = 0; i < 97; ++i) {  /* 2^224 - 2^97 */
 771         felem_square(tmp, ftmp);
 772         felem_reduce(ftmp, tmp);
 773     }
 774     felem_mul(tmp, ftmp, ftmp3);
 775     felem_reduce(out, tmp);     /* 2^224 - 2^96 - 1 */
 776 }
 777
 778 /*
 779  * Copy in constant time: if icopy == 1, copy in to out, if icopy == 0, copy
 780  * out to itself.
 781  */
 782 static void copy_conditional(felem out, const felem in, limb icopy)
 783 {
 784     unsigned i;
 785     /*
 786      * icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
 787      */
 788     const limb copy = -icopy;
 789     for (i = 0; i < 4; ++i) {
 790         const limb tmp = copy & (in[i] ^ out[i]);
 791         out[i] ^= tmp;
 792     }
 793 }
 794
 795 /******************************************************************************/
 796 /*-
 797  *                       ELLIPTIC CURVE POINT OPERATIONS
 798  *
 799  * Points are represented in Jacobian projective coordinates:
 800  * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
 801  * or to the point at infinity if Z == 0.
 802  *
 803  */
 804
 805 /*-
 806  * Double an elliptic curve point:
 807  * (X', Y', Z') = 2 * (X, Y, Z), where
 808  * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
 809  * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^4
 810  * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
 811  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 812  * while x_out == y_in is not (maybe this works, but it's not tested).
 813  */
 814 static void
 815 point_double(felem x_out, felem y_out, felem z_out,
 816              const felem x_in, const felem y_in, const felem z_in)
 817 {
 818     widefelem tmp, tmp2;
 819     felem delta, gamma, beta, alpha, ftmp, ftmp2;
 820
 821     felem_assign(ftmp, x_in);
 822     felem_assign(ftmp2, x_in);
 823
 824     /* delta = z^2 */
 825     felem_square(tmp, z_in);
 826     felem_reduce(delta, tmp);
 827
 828     /* gamma = y^2 */
 829     felem_square(tmp, y_in);
 830     felem_reduce(gamma, tmp);
 831
 832     /* beta = x*gamma */
 833     felem_mul(tmp, x_in, gamma);
 834     felem_reduce(beta, tmp);
 835
 836     /* alpha = 3*(x-delta)*(x+delta) */
 837     felem_diff(ftmp, delta);
 838     /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
 839     felem_sum(ftmp2, delta);
 840     /* ftmp2[i] < 2^57 + 2^57 = 2^58 */
 841     felem_scalar(ftmp2, 3);
 842     /* ftmp2[i] < 3 * 2^58 < 2^60 */
 843     felem_mul(tmp, ftmp, ftmp2);
 844     /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
 845     felem_reduce(alpha, tmp);
 846
 847     /* x' = alpha^2 - 8*beta */
 848     felem_square(tmp, alpha);
 849     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 850     felem_assign(ftmp, beta);
 851     felem_scalar(ftmp, 8);
 852     /* ftmp[i] < 8 * 2^57 = 2^60 */
 853     felem_diff_128_64(tmp, ftmp);
 854     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 855     felem_reduce(x_out, tmp);
 856
 857     /* z' = (y + z)^2 - gamma - delta */
 858     felem_sum(delta, gamma);
 859     /* delta[i] < 2^57 + 2^57 = 2^58 */
 860     felem_assign(ftmp, y_in);
 861     felem_sum(ftmp, z_in);
 862     /* ftmp[i] < 2^57 + 2^57 = 2^58 */
 863     felem_square(tmp, ftmp);
 864     /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
 865     felem_diff_128_64(tmp, delta);
 866     /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
 867     felem_reduce(z_out, tmp);
 868
 869     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
 870     felem_scalar(beta, 4);
 871     /* beta[i] < 4 * 2^57 = 2^59 */
 872     felem_diff(beta, x_out);
 873     /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
 874     felem_mul(tmp, alpha, beta);
 875     /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
 876     felem_square(tmp2, gamma);
 877     /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
 878     widefelem_scalar(tmp2, 8);
 879     /* tmp2[i] < 8 * 2^116 = 2^119 */
 880     widefelem_diff(tmp, tmp2);
 881     /* tmp[i] < 2^119 + 2^120 < 2^121 */
 882     felem_reduce(y_out, tmp);
 883 }
 884
 885 /*-
 886  * Add two elliptic curve points:
 887  * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
 888  * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
 889  * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
 890  * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
 891  *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
 892  * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
 893  *
 894  * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
 895  */
 896
 897 /*
 898  * This function is not entirely constant-time: it includes a branch for
 899  * checking whether the two input points are equal, (while not equal to the
 900  * point at infinity). This case never happens during single point
 901  * multiplication, so there is no timing leak for ECDH or ECDSA signing.
 902  */
 903 static void point_add(felem x3, felem y3, felem z3,
 904                       const felem x1, const felem y1, const felem z1,
 905                       const int mixed, const felem x2, const felem y2,
 906                       const felem z2)
 907 {
 908     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
 909     widefelem tmp, tmp2;
 910     limb z1_is_zero, z2_is_zero, x_equal, y_equal;
 911     limb points_equal;
 912
 913     if (!mixed) {
 914         /* ftmp2 = z2^2 */
 915         felem_square(tmp, z2);
 916         felem_reduce(ftmp2, tmp);
 917
 918         /* ftmp4 = z2^3 */
 919         felem_mul(tmp, ftmp2, z2);
 920         felem_reduce(ftmp4, tmp);
 921
 922         /* ftmp4 = z2^3*y1 */
 923         felem_mul(tmp2, ftmp4, y1);
 924         felem_reduce(ftmp4, tmp2);
 925
 926         /* ftmp2 = z2^2*x1 */
 927         felem_mul(tmp2, ftmp2, x1);
 928         felem_reduce(ftmp2, tmp2);
 929     } else {
 930         /*
 931          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
 932          */
 933
 934         /* ftmp4 = z2^3*y1 */
 935         felem_assign(ftmp4, y1);
 936
 937         /* ftmp2 = z2^2*x1 */
 938         felem_assign(ftmp2, x1);
 939     }
 940
 941     /* ftmp = z1^2 */
 942     felem_square(tmp, z1);
 943     felem_reduce(ftmp, tmp);
 944
 945     /* ftmp3 = z1^3 */
 946     felem_mul(tmp, ftmp, z1);
 947     felem_reduce(ftmp3, tmp);
 948
 949     /* tmp = z1^3*y2 */
 950     felem_mul(tmp, ftmp3, y2);
 951     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 952
 953     /* ftmp3 = z1^3*y2 - z2^3*y1 */
 954     felem_diff_128_64(tmp, ftmp4);
 955     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 956     felem_reduce(ftmp3, tmp);
 957
 958     /* tmp = z1^2*x2 */
 959     felem_mul(tmp, ftmp, x2);
 960     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
 961
 962     /* ftmp = z1^2*x2 - z2^2*x1 */
 963     felem_diff_128_64(tmp, ftmp2);
 964     /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
 965     felem_reduce(ftmp, tmp);
 966
 967     /*
 968      * The formulae are incorrect if the points are equal, in affine coordinates
 969      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
 970      * happens.
 971      *
 972      * We use bitwise operations to avoid potential side-channels introduced by
 973      * the short-circuiting behaviour of boolean operators.
 974      */
 975     x_equal = felem_is_zero(ftmp);
 976     y_equal = felem_is_zero(ftmp3);
 977     /*
 978      * The special case of either point being the point at infinity (z1 and/or
 979      * z2 are zero), is handled separately later on in this function, so we
 980      * avoid jumping to point_double here in those special cases.
 981      */
 982     z1_is_zero = felem_is_zero(z1);
 983     z2_is_zero = felem_is_zero(z2);
 984
 985     /*
 986      * Compared to `ecp_nistp256.c` and `ecp_nistp521.c`, in this
 987      * specific implementation `felem_is_zero()` returns truth as `0x1`
 988      * (rather than `0xff..ff`).
 989      *
 990      * This implies that `~true` in this implementation becomes
 991      * `0xff..fe` (rather than `0x0`): for this reason, to be used in
 992      * the if expression, we mask out only the last bit in the next
 993      * line.
 994      */
 995     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero)) & 1;
 996
 997     if (points_equal) {
 998         /*
 999          * This is obviously not constant-time but, as mentioned before, this
1000          * case never happens during single point multiplication, so there is no
1001          * timing leak for ECDH or ECDSA signing.
1002          */
1003         point_double(x3, y3, z3, x1, y1, z1);
1004         return;
1005     }
1006
1007     /* ftmp5 = z1*z2 */
1008     if (!mixed) {
1009         felem_mul(tmp, z1, z2);
1010         felem_reduce(ftmp5, tmp);
1011     } else {
1012         /* special case z2 = 0 is handled later */
1013         felem_assign(ftmp5, z1);
1014     }
1015
1016     /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
1017     felem_mul(tmp, ftmp, ftmp5);
1018     felem_reduce(z_out, tmp);
1019
1020     /* ftmp = (z1^2*x2 - z2^2*x1)^2 */
1021     felem_assign(ftmp5, ftmp);
1022     felem_square(tmp, ftmp);
1023     felem_reduce(ftmp, tmp);
1024
1025     /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
1026     felem_mul(tmp, ftmp, ftmp5);
1027     felem_reduce(ftmp5, tmp);
1028
1029     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1030     felem_mul(tmp, ftmp2, ftmp);
1031     felem_reduce(ftmp2, tmp);
1032
1033     /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
1034     felem_mul(tmp, ftmp4, ftmp5);
1035     /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
1036
1037     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
1038     felem_square(tmp2, ftmp3);
1039     /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
1040
1041     /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
1042     felem_diff_128_64(tmp2, ftmp5);
1043     /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
1044
1045     /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
1046     felem_assign(ftmp5, ftmp2);
1047     felem_scalar(ftmp5, 2);
1048     /* ftmp5[i] < 2 * 2^57 = 2^58 */
1049
1050     /*-
1051      * x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
1052      *  2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
1053      */
1054     felem_diff_128_64(tmp2, ftmp5);
1055     /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
1056     felem_reduce(x_out, tmp2);
1057
1058     /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
1059     felem_diff(ftmp2, x_out);
1060     /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
1061
1062     /*
1063      * tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
1064      */
1065     felem_mul(tmp2, ftmp3, ftmp2);
1066     /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
1067
1068     /*-
1069      * y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
1070      *  z2^3*y1*(z1^2*x2 - z2^2*x1)^3
1071      */
1072     widefelem_diff(tmp2, tmp);
1073     /* tmp2[i] < 2^118 + 2^120 < 2^121 */
1074     felem_reduce(y_out, tmp2);
1075
1076     /*
1077      * the result (x_out, y_out, z_out) is incorrect if one of the inputs is
1078      * the point at infinity, so we need to check for this separately
1079      */
1080
1081     /*
1082      * if point 1 is at infinity, copy point 2 to output, and vice versa
1083      */
1084     copy_conditional(x_out, x2, z1_is_zero);
1085     copy_conditional(x_out, x1, z2_is_zero);
1086     copy_conditional(y_out, y2, z1_is_zero);
1087     copy_conditional(y_out, y1, z2_is_zero);
1088     copy_conditional(z_out, z2, z1_is_zero);
1089     copy_conditional(z_out, z1, z2_is_zero);
1090     felem_assign(x3, x_out);
1091     felem_assign(y3, y_out);
1092     felem_assign(z3, z_out);
1093 }
1094
1095 /*
1096  * select_point selects the |idx|th point from a precomputation table and
1097  * copies it to out.
1098  * The pre_comp array argument should be size of |size| argument
1099  */
1100 static void select_point(const u64 idx, unsigned int size,
1101                          const felem pre_comp[][3], felem out[3])
1102 {
1103     unsigned i, j;
1104     limb *outlimbs = &out[0][0];
1105
1106     memset(out, 0, sizeof(*out) * 3);
1107     for (i = 0; i < size; i++) {
1108         const limb *inlimbs = &pre_comp[i][0][0];
1109         u64 mask = i ^ idx;
1110         mask |= mask >> 4;
1111         mask |= mask >> 2;
1112         mask |= mask >> 1;
1113         mask &= 1;
1114         mask--;
1115         for (j = 0; j < 4 * 3; j++)
1116             outlimbs[j] |= inlimbs[j] & mask;
1117     }
1118 }
1119
1120 /* get_bit returns the |i|th bit in |in| */
1121 static char get_bit(const felem_bytearray in, unsigned i)
1122 {
1123     if (i >= 224)
1124         return 0;
1125     return (in[i >> 3] >> (i & 7)) & 1;
1126 }
1127
1128 /*
1129  * Interleaved point multiplication using precomputed point multiples: The
1130  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1131  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1132  * generator, using certain (large) precomputed multiples in g_pre_comp.
1133  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1134  */
1135 static void batch_mul(felem x_out, felem y_out, felem z_out,
1136                       const felem_bytearray scalars[],
1137                       const unsigned num_points, const u8 *g_scalar,
1138                       const int mixed, const felem pre_comp[][17][3],
1139                       const felem g_pre_comp[2][16][3])
1140 {
1141     int i, skip;
1142     unsigned num;
1143     unsigned gen_mul = (g_scalar != NULL);
1144     felem nq[3], tmp[4];
1145     u64 bits;
1146     u8 sign, digit;
1147
1148     /* set nq to the point at infinity */
1149     memset(nq, 0, sizeof(nq));
1150
1151     /*
1152      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1153      * of the generator (two in each of the last 28 rounds) and additions of
1154      * other points multiples (every 5th round).
1155      */
1156     skip = 1;                   /* save two point operations in the first
1157                                  * round */
1158     for (i = (num_points ? 220 : 27); i >= 0; --i) {
1159         /* double */
1160         if (!skip)
1161             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1162
1163         /* add multiples of the generator */
1164         if (gen_mul && (i <= 27)) {
1165             /* first, look 28 bits upwards */
1166             bits = get_bit(g_scalar, i + 196) << 3;
1167             bits |= get_bit(g_scalar, i + 140) << 2;
1168             bits |= get_bit(g_scalar, i + 84) << 1;
1169             bits |= get_bit(g_scalar, i + 28);
1170             /* select the point to add, in constant time */
1171             select_point(bits, 16, g_pre_comp[1], tmp);
1172
1173             if (!skip) {
1174                 /* value 1 below is argument for "mixed" */
1175                 point_add(nq[0], nq[1], nq[2],
1176                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1177             } else {
1178                 memcpy(nq, tmp, 3 * sizeof(felem));
1179                 skip = 0;
1180             }
1181
1182             /* second, look at the current position */
1183             bits = get_bit(g_scalar, i + 168) << 3;
1184             bits |= get_bit(g_scalar, i + 112) << 2;
1185             bits |= get_bit(g_scalar, i + 56) << 1;
1186             bits |= get_bit(g_scalar, i);
1187             /* select the point to add, in constant time */
1188             select_point(bits, 16, g_pre_comp[0], tmp);
1189             point_add(nq[0], nq[1], nq[2],
1190                       nq[0], nq[1], nq[2],
1191                       1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1192         }
1193
1194         /* do other additions every 5 doublings */
1195         if (num_points && (i % 5 == 0)) {
1196             /* loop over all scalars */
1197             for (num = 0; num < num_points; ++num) {
1198                 bits = get_bit(scalars[num], i + 4) << 5;
1199                 bits |= get_bit(scalars[num], i + 3) << 4;
1200                 bits |= get_bit(scalars[num], i + 2) << 3;
1201                 bits |= get_bit(scalars[num], i + 1) << 2;
1202                 bits |= get_bit(scalars[num], i) << 1;
1203                 bits |= get_bit(scalars[num], i - 1);
1204                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1205
1206                 /* select the point to add or subtract */
1207                 select_point(digit, 17, pre_comp[num], tmp);
1208                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1209                                             * point */
1210                 copy_conditional(tmp[1], tmp[3], sign);
1211
1212                 if (!skip) {
1213                     point_add(nq[0], nq[1], nq[2],
1214                               nq[0], nq[1], nq[2],
1215                               mixed, tmp[0], tmp[1], tmp[2]);
1216                 } else {
1217                     memcpy(nq, tmp, 3 * sizeof(felem));
1218                     skip = 0;
1219                 }
1220             }
1221         }
1222     }
1223     felem_assign(x_out, nq[0]);
1224     felem_assign(y_out, nq[1]);
1225     felem_assign(z_out, nq[2]);
1226 }
1227
1228 /******************************************************************************/
1229 /*
1230  * FUNCTIONS TO MANAGE PRECOMPUTATION
1231  */
1232
1233 static NISTP224_PRE_COMP *nistp224_pre_comp_new(void)
1234 {
1235     NISTP224_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1236
1237     if (!ret) {
1238         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1239         return ret;
1240     }
1241
1242     ret->references = 1;
1243
1244     ret->lock = CRYPTO_THREAD_lock_new();
1245     if (ret->lock == NULL) {
1246         ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1247         OPENSSL_free(ret);
1248         return NULL;
1249     }
1250     return ret;
1251 }
1252
1253 NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *p)
1254 {
1255     int i;
1256     if (p != NULL)
1257         CRYPTO_UP_REF(&p->references, &i, p->lock);
1258     return p;
1259 }
1260
1261 void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *p)
1262 {
1263     int i;
1264
1265     if (p == NULL)
1266         return;
1267
1268     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1269     REF_PRINT_COUNT("EC_nistp224", x);
1270     if (i > 0)
1271         return;
1272     REF_ASSERT_ISNT(i < 0);
1273
1274     CRYPTO_THREAD_lock_free(p->lock);
1275     OPENSSL_free(p);
1276 }
1277
1278 /******************************************************************************/
1279 /*
1280  * OPENSSL EC_METHOD FUNCTIONS
1281  */
1282
1283 int ec_GFp_nistp224_group_init(EC_GROUP *group)
1284 {
1285     int ret;
1286     ret = ec_GFp_simple_group_init(group);
1287     group->a_is_minus3 = 1;
1288     return ret;
1289 }
1290
1291 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1292                                     const BIGNUM *a, const BIGNUM *b,
1293                                     BN_CTX *ctx)
1294 {
1295     int ret = 0;
1296     BN_CTX *new_ctx = NULL;
1297     BIGNUM *curve_p, *curve_a, *curve_b;
1298
1299     if (ctx == NULL)
1300         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1301             return 0;
1302     BN_CTX_start(ctx);
1303     curve_p = BN_CTX_get(ctx);
1304     curve_a = BN_CTX_get(ctx);
1305     curve_b = BN_CTX_get(ctx);
1306     if (curve_b == NULL)
1307         goto err;
1308     BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
1309     BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
1310     BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
1311     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1312         ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
1313               EC_R_WRONG_CURVE_PARAMETERS);
1314         goto err;
1315     }
1316     group->field_mod_func = BN_nist_mod_224;
1317     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1318  err:
1319     BN_CTX_end(ctx);
1320     BN_CTX_free(new_ctx);
1321     return ret;
1322 }
1323
1324 /*
1325  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1326  * (X/Z^2, Y/Z^3)
1327  */
1328 int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
1329                                                  const EC_POINT *point,
1330                                                  BIGNUM *x, BIGNUM *y,
1331                                                  BN_CTX *ctx)
1332 {
1333     felem z1, z2, x_in, y_in, x_out, y_out;
1334     widefelem tmp;
1335
1336     if (EC_POINT_is_at_infinity(group, point)) {
1337         ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1338               EC_R_POINT_AT_INFINITY);
1339         return 0;
1340     }
1341     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1342         (!BN_to_felem(z1, point->Z)))
1343         return 0;
1344     felem_inv(z2, z1);
1345     felem_square(tmp, z2);
1346     felem_reduce(z1, tmp);
1347     felem_mul(tmp, x_in, z1);
1348     felem_reduce(x_in, tmp);
1349     felem_contract(x_out, x_in);
1350     if (x != NULL) {
1351         if (!felem_to_BN(x, x_out)) {
1352             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1353                   ERR_R_BN_LIB);
1354             return 0;
1355         }
1356     }
1357     felem_mul(tmp, z1, z2);
1358     felem_reduce(z1, tmp);
1359     felem_mul(tmp, y_in, z1);
1360     felem_reduce(y_in, tmp);
1361     felem_contract(y_out, y_in);
1362     if (y != NULL) {
1363         if (!felem_to_BN(y, y_out)) {
1364             ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
1365                   ERR_R_BN_LIB);
1366             return 0;
1367         }
1368     }
1369     return 1;
1370 }
1371
1372 static void make_points_affine(size_t num, felem points[ /* num */ ][3],
1373                                felem tmp_felems[ /* num+1 */ ])
1374 {
1375     /*
1376      * Runs in constant time, unless an input is the point at infinity (which
1377      * normally shouldn't happen).
1378      */
1379     ec_GFp_nistp_points_make_affine_internal(num,
1380                                              points,
1381                                              sizeof(felem),
1382                                              tmp_felems,
1383                                              (void (*)(void *))felem_one,
1384                                              felem_is_zero_int,
1385                                              (void (*)(void *, const void *))
1386                                              felem_assign,
1387                                              (void (*)(void *, const void *))
1388                                              felem_square_reduce, (void (*)
1389                                                                    (void *,
1390                                                                     const void
1391                                                                     *,
1392                                                                     const void
1393                                                                     *))
1394                                              felem_mul_reduce,
1395                                              (void (*)(void *, const void *))
1396                                              felem_inv,
1397                                              (void (*)(void *, const void *))
1398                                              felem_contract);
1399 }
1400
1401 /*
1402  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1403  * values Result is stored in r (r can equal one of the inputs).
1404  */
1405 int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
1406                                const BIGNUM *scalar, size_t num,
1407                                const EC_POINT *points[],
1408                                const BIGNUM *scalars[], BN_CTX *ctx)
1409 {
1410     int ret = 0;
1411     int j;
1412     unsigned i;
1413     int mixed = 0;
1414     BIGNUM *x, *y, *z, *tmp_scalar;
1415     felem_bytearray g_secret;
1416     felem_bytearray *secrets = NULL;
1417     felem (*pre_comp)[17][3] = NULL;
1418     felem *tmp_felems = NULL;
1419     int num_bytes;
1420     int have_pre_comp = 0;
1421     size_t num_points = num;
1422     felem x_in, y_in, z_in, x_out, y_out, z_out;
1423     NISTP224_PRE_COMP *pre = NULL;
1424     const felem(*g_pre_comp)[16][3] = NULL;
1425     EC_POINT *generator = NULL;
1426     const EC_POINT *p = NULL;
1427     const BIGNUM *p_scalar = NULL;
1428
1429     BN_CTX_start(ctx);
1430     x = BN_CTX_get(ctx);
1431     y = BN_CTX_get(ctx);
1432     z = BN_CTX_get(ctx);
1433     tmp_scalar = BN_CTX_get(ctx);
1434     if (tmp_scalar == NULL)
1435         goto err;
1436
1437     if (scalar != NULL) {
1438         pre = group->pre_comp.nistp224;
1439         if (pre)
1440             /* we have precomputation, try to use it */
1441             g_pre_comp = (const felem(*)[16][3])pre->g_pre_comp;
1442         else
1443             /* try to use the standard precomputation */
1444             g_pre_comp = &gmul[0];
1445         generator = EC_POINT_new(group);
1446         if (generator == NULL)
1447             goto err;
1448         /* get the generator from precomputation */
1449         if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
1450             !felem_to_BN(y, g_pre_comp[0][1][1]) ||
1451             !felem_to_BN(z, g_pre_comp[0][1][2])) {
1452             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1453             goto err;
1454         }
1455         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1456                                                       generator, x, y, z,
1457                                                       ctx))
1458             goto err;
1459         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1460             /* precomputation matches generator */
1461             have_pre_comp = 1;
1462         else
1463             /*
1464              * we don't have valid precomputation: treat the generator as a
1465              * random point
1466              */
1467             num_points = num_points + 1;
1468     }
1469
1470     if (num_points > 0) {
1471         if (num_points >= 3) {
1472             /*
1473              * unless we precompute multiples for just one or two points,
1474              * converting those into affine form is time well spent
1475              */
1476             mixed = 1;
1477         }
1478         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1479         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1480         if (mixed)
1481             tmp_felems =
1482                 OPENSSL_malloc(sizeof(felem) * (num_points * 17 + 1));
1483         if ((secrets == NULL) || (pre_comp == NULL)
1484             || (mixed && (tmp_felems == NULL))) {
1485             ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1486             goto err;
1487         }
1488
1489         /*
1490          * we treat NULL scalars as 0, and NULL points as points at infinity,
1491          * i.e., they contribute nothing to the linear combination
1492          */
1493         for (i = 0; i < num_points; ++i) {
1494             if (i == num) {
1495                 /* the generator */
1496                 p = EC_GROUP_get0_generator(group);
1497                 p_scalar = scalar;
1498             } else {
1499                 /* the i^th point */
1500                 p = points[i];
1501                 p_scalar = scalars[i];
1502             }
1503             if ((p_scalar != NULL) && (p != NULL)) {
1504                 /* reduce scalar to 0 <= scalar < 2^224 */
1505                 if ((BN_num_bits(p_scalar) > 224)
1506                     || (BN_is_negative(p_scalar))) {
1507                     /*
1508                      * this is an unusual input, and we don't guarantee
1509                      * constant-timeness
1510                      */
1511                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1512                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1513                         goto err;
1514                     }
1515                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1516                                                secrets[i], sizeof(secrets[i]));
1517                 } else {
1518                     num_bytes = BN_bn2lebinpad(p_scalar,
1519                                                secrets[i], sizeof(secrets[i]));
1520                 }
1521                 if (num_bytes < 0) {
1522                     ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1523                     goto err;
1524                 }
1525                 /* precompute multiples */
1526                 if ((!BN_to_felem(x_out, p->X)) ||
1527                     (!BN_to_felem(y_out, p->Y)) ||
1528                     (!BN_to_felem(z_out, p->Z)))
1529                     goto err;
1530                 felem_assign(pre_comp[i][1][0], x_out);
1531                 felem_assign(pre_comp[i][1][1], y_out);
1532                 felem_assign(pre_comp[i][1][2], z_out);
1533                 for (j = 2; j <= 16; ++j) {
1534                     if (j & 1) {
1535                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1536                                   pre_comp[i][j][2], pre_comp[i][1][0],
1537                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1538                                   pre_comp[i][j - 1][0],
1539                                   pre_comp[i][j - 1][1],
1540                                   pre_comp[i][j - 1][2]);
1541                     } else {
1542                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1543                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1544                                      pre_comp[i][j / 2][1],
1545                                      pre_comp[i][j / 2][2]);
1546                     }
1547                 }
1548             }
1549         }
1550         if (mixed)
1551             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1552     }
1553
1554     /* the scalar for the generator */
1555     if ((scalar != NULL) && (have_pre_comp)) {
1556         memset(g_secret, 0, sizeof(g_secret));
1557         /* reduce scalar to 0 <= scalar < 2^224 */
1558         if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) {
1559             /*
1560              * this is an unusual input, and we don't guarantee
1561              * constant-timeness
1562              */
1563             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1564                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1565                 goto err;
1566             }
1567             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1568         } else {
1569             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1570         }
1571         /* do the multiplication with generator precomputation */
1572         batch_mul(x_out, y_out, z_out,
1573                   (const felem_bytearray(*))secrets, num_points,
1574                   g_secret,
1575                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
1576     } else {
1577         /* do the multiplication without generator precomputation */
1578         batch_mul(x_out, y_out, z_out,
1579                   (const felem_bytearray(*))secrets, num_points,
1580                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1581     }
1582     /* reduce the output to its unique minimal representation */
1583     felem_contract(x_in, x_out);
1584     felem_contract(y_in, y_out);
1585     felem_contract(z_in, z_out);
1586     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1587         (!felem_to_BN(z, z_in))) {
1588         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
1589         goto err;
1590     }
1591     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1592
1593  err:
1594     BN_CTX_end(ctx);
1595     EC_POINT_free(generator);
1596     OPENSSL_free(secrets);
1597     OPENSSL_free(pre_comp);
1598     OPENSSL_free(tmp_felems);
1599     return ret;
1600 }
1601
1602 int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1603 {
1604     int ret = 0;
1605     NISTP224_PRE_COMP *pre = NULL;
1606     int i, j;
1607     BN_CTX *new_ctx = NULL;
1608     BIGNUM *x, *y;
1609     EC_POINT *generator = NULL;
1610     felem tmp_felems[32];
1611
1612     /* throw away old precomputation */
1613     EC_pre_comp_free(group);
1614     if (ctx == NULL)
1615         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1616             return 0;
1617     BN_CTX_start(ctx);
1618     x = BN_CTX_get(ctx);
1619     y = BN_CTX_get(ctx);
1620     if (y == NULL)
1621         goto err;
1622     /* get the generator */
1623     if (group->generator == NULL)
1624         goto err;
1625     generator = EC_POINT_new(group);
1626     if (generator == NULL)
1627         goto err;
1628     BN_bin2bn(nistp224_curve_params[3], sizeof(felem_bytearray), x);
1629     BN_bin2bn(nistp224_curve_params[4], sizeof(felem_bytearray), y);
1630     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1631         goto err;
1632     if ((pre = nistp224_pre_comp_new()) == NULL)
1633         goto err;
1634     /*
1635      * if the generator is the standard one, use built-in precomputation
1636      */
1637     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1638         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1639         goto done;
1640     }
1641     if ((!BN_to_felem(pre->g_pre_comp[0][1][0], group->generator->X)) ||
1642         (!BN_to_felem(pre->g_pre_comp[0][1][1], group->generator->Y)) ||
1643         (!BN_to_felem(pre->g_pre_comp[0][1][2], group->generator->Z)))
1644         goto err;
1645     /*
1646      * compute 2^56*G, 2^112*G, 2^168*G for the first table, 2^28*G, 2^84*G,
1647      * 2^140*G, 2^196*G for the second one
1648      */
1649     for (i = 1; i <= 8; i <<= 1) {
1650         point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1651                      pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
1652                      pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
1653         for (j = 0; j < 27; ++j) {
1654             point_double(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
1655                          pre->g_pre_comp[1][i][2], pre->g_pre_comp[1][i][0],
1656                          pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1657         }
1658         if (i == 8)
1659             break;
1660         point_double(pre->g_pre_comp[0][2 * i][0],
1661                      pre->g_pre_comp[0][2 * i][1],
1662                      pre->g_pre_comp[0][2 * i][2], pre->g_pre_comp[1][i][0],
1663                      pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
1664         for (j = 0; j < 27; ++j) {
1665             point_double(pre->g_pre_comp[0][2 * i][0],
1666                          pre->g_pre_comp[0][2 * i][1],
1667                          pre->g_pre_comp[0][2 * i][2],
1668                          pre->g_pre_comp[0][2 * i][0],
1669                          pre->g_pre_comp[0][2 * i][1],
1670                          pre->g_pre_comp[0][2 * i][2]);
1671         }
1672     }
1673     for (i = 0; i < 2; i++) {
1674         /* g_pre_comp[i][0] is the point at infinity */
1675         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
1676         /* the remaining multiples */
1677         /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
1678         point_add(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
1679                   pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
1680                   pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
1681                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1682                   pre->g_pre_comp[i][2][2]);
1683         /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
1684         point_add(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
1685                   pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
1686                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1687                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1688                   pre->g_pre_comp[i][2][2]);
1689         /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
1690         point_add(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
1691                   pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
1692                   pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
1693                   0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
1694                   pre->g_pre_comp[i][4][2]);
1695         /*
1696          * 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G
1697          */
1698         point_add(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
1699                   pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
1700                   pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
1701                   0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
1702                   pre->g_pre_comp[i][2][2]);
1703         for (j = 1; j < 8; ++j) {
1704             /* odd multiples: add G resp. 2^28*G */
1705             point_add(pre->g_pre_comp[i][2 * j + 1][0],
1706                       pre->g_pre_comp[i][2 * j + 1][1],
1707                       pre->g_pre_comp[i][2 * j + 1][2],
1708                       pre->g_pre_comp[i][2 * j][0],
1709                       pre->g_pre_comp[i][2 * j][1],
1710                       pre->g_pre_comp[i][2 * j][2], 0,
1711                       pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
1712                       pre->g_pre_comp[i][1][2]);
1713         }
1714     }
1715     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
1716
1717  done:
1718     SETPRECOMP(group, nistp224, pre);
1719     pre = NULL;
1720     ret = 1;
1721  err:
1722     BN_CTX_end(ctx);
1723     EC_POINT_free(generator);
1724     BN_CTX_free(new_ctx);
1725     EC_nistp224_pre_comp_free(pre);
1726     return ret;
1727 }
1728
1729 int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
1730 {
1731     return HAVEPRECOMP(group, nistp224);
1732 }
1733
1734 #endif