contrib/bearssl/src/int/i31_moddiv.c

   1 /*
   2  * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * In this file, we handle big integers with a custom format, i.e.
  29  * without the usual one-word header. Value is split into 31-bit words,
  30  * each stored in a 32-bit slot (top bit is zero) in little-endian
  31  * order. The length (in words) is provided explicitly. In some cases,
  32  * the value can be negative (using two's complement representation). In
  33  * some cases, the top word is allowed to have a 32th bit.
  34  */
  35
  36 /*
  37  * Negate big integer conditionally. The value consists of 'len' words,
  38  * with 31 bits in each word (the top bit of each word should be 0,
  39  * except possibly for the last word). If 'ctl' is 1, the negation is
  40  * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
  41  */
  42 static void
  43 cond_negate(uint32_t *a, size_t len, uint32_t ctl)
  44 {
  45         size_t k;
  46         uint32_t cc, xm;
  47
  48         cc = ctl;
  49         xm = -ctl >> 1;
  50         for (k = 0; k < len; k ++) {
  51                 uint32_t aw;
  52
  53                 aw = a[k];
  54                 aw = (aw ^ xm) + cc;
  55                 a[k] = aw & 0x7FFFFFFF;
  56                 cc = aw >> 31;
  57         }
  58 }
  59
  60 /*
  61  * Finish modular reduction. Rules on input parameters:
  62  *
  63  *   if neg = 1, then -m <= a < 0
  64  *   if neg = 0, then 0 <= a < 2*m
  65  *
  66  * If neg = 0, then the top word of a[] may use 32 bits.
  67  *
  68  * Also, modulus m must be odd.
  69  */
  70 static void
  71 finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)
  72 {
  73         size_t k;
  74         uint32_t cc, xm, ym;
  75
  76         /*
  77          * First pass: compare a (assumed nonnegative) with m.
  78          * Note that if the final word uses the top extra bit, then
  79          * subtracting m must yield a value less than 2^31, since we
  80          * assumed that a < 2*m.
  81          */
  82         cc = 0;
  83         for (k = 0; k < len; k ++) {
  84                 uint32_t aw, mw;
  85
  86                 aw = a[k];
  87                 mw = m[k];
  88                 cc = (aw - mw - cc) >> 31;
  89         }
  90
  91         /*
  92          * At this point:
  93          *   if neg = 1, then we must add m (regardless of cc)
  94          *   if neg = 0 and cc = 0, then we must subtract m
  95          *   if neg = 0 and cc = 1, then we must do nothing
  96          */
  97         xm = -neg >> 1;
  98         ym = -(neg | (1 - cc));
  99         cc = neg;
 100         for (k = 0; k < len; k ++) {
 101                 uint32_t aw, mw;
 102
 103                 aw = a[k];
 104                 mw = (m[k] ^ xm) & ym;
 105                 aw = aw - mw - cc;
 106                 a[k] = aw & 0x7FFFFFFF;
 107                 cc = aw >> 31;
 108         }
 109 }
 110
 111 /*
 112  * Compute:
 113  *   a <- (a*pa+b*pb)/(2^31)
 114  *   b <- (a*qa+b*qb)/(2^31)
 115  * The division is assumed to be exact (i.e. the low word is dropped).
 116  * If the final a is negative, then it is negated. Similarly for b.
 117  * Returned value is the combination of two bits:
 118  *   bit 0: 1 if a had to be negated, 0 otherwise
 119  *   bit 1: 1 if b had to be negated, 0 otherwise
 120  *
 121  * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
 122  * Source integers a and b must be nonnegative; top word is not allowed
 123  * to contain an extra 32th bit.
 124  */
 125 static uint32_t
 126 co_reduce(uint32_t *a, uint32_t *b, size_t len,
 127         int64_t pa, int64_t pb, int64_t qa, int64_t qb)
 128 {
 129         size_t k;
 130         int64_t cca, ccb;
 131         uint32_t nega, negb;
 132
 133         cca = 0;
 134         ccb = 0;
 135         for (k = 0; k < len; k ++) {
 136                 uint32_t wa, wb;
 137                 uint64_t za, zb;
 138                 uint64_t tta, ttb;
 139
 140                 /*
 141                  * Since:
 142                  *   |pa| <= 2^31
 143                  *   |pb| <= 2^31
 144                  *   0 <= wa <= 2^31 - 1
 145                  *   0 <= wb <= 2^31 - 1
 146                  *   |cca| <= 2^32 - 1
 147                  * Then:
 148                  *   |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1
 149                  *
 150                  * Thus, the new value of cca is such that |cca| <= 2^32 - 1.
 151                  * The same applies to ccb.
 152                  */
 153                 wa = a[k];
 154                 wb = b[k];
 155                 za = wa * (uint64_t)pa + wb * (uint64_t)pb + (uint64_t)cca;
 156                 zb = wa * (uint64_t)qa + wb * (uint64_t)qb + (uint64_t)ccb;
 157                 if (k > 0) {
 158                         a[k - 1] = za & 0x7FFFFFFF;
 159                         b[k - 1] = zb & 0x7FFFFFFF;
 160                 }
 161
 162                 /*
 163                  * For the new values of cca and ccb, we need a signed
 164                  * right-shift; since, in C, right-shifting a signed
 165                  * negative value is implementation-defined, we use a
 166                  * custom portable sign extension expression.
 167                  */
 168 #define M   ((uint64_t)1 << 32)
 169                 tta = za >> 31;
 170                 ttb = zb >> 31;
 171                 tta = (tta ^ M) - M;
 172                 ttb = (ttb ^ M) - M;
 173                 cca = *(int64_t *)&tta;
 174                 ccb = *(int64_t *)&ttb;
 175 #undef M
 176         }
 177         a[len - 1] = (uint32_t)cca;
 178         b[len - 1] = (uint32_t)ccb;
 179
 180         nega = (uint32_t)((uint64_t)cca >> 63);
 181         negb = (uint32_t)((uint64_t)ccb >> 63);
 182         cond_negate(a, len, nega);
 183         cond_negate(b, len, negb);
 184         return nega | (negb << 1);
 185 }
 186
 187 /*
 188  * Compute:
 189  *   a <- (a*pa+b*pb)/(2^31) mod m
 190  *   b <- (a*qa+b*qb)/(2^31) mod m
 191  *
 192  * m0i is equal to -1/m[0] mod 2^31.
 193  *
 194  * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
 195  * Source integers a and b must be nonnegative; top word is not allowed
 196  * to contain an extra 32th bit.
 197  */
 198 static void
 199 co_reduce_mod(uint32_t *a, uint32_t *b, size_t len,
 200         int64_t pa, int64_t pb, int64_t qa, int64_t qb,
 201         const uint32_t *m, uint32_t m0i)
 202 {
 203         size_t k;
 204         int64_t cca, ccb;
 205         uint32_t fa, fb;
 206
 207         cca = 0;
 208         ccb = 0;
 209         fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFFFFFF;
 210         fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFFFFFF;
 211         for (k = 0; k < len; k ++) {
 212                 uint32_t wa, wb;
 213                 uint64_t za, zb;
 214                 uint64_t tta, ttb;
 215
 216                 /*
 217                  * In this loop, carries 'cca' and 'ccb' always fit on
 218                  * 33 bits (in absolute value).
 219                  */
 220                 wa = a[k];
 221                 wb = b[k];
 222                 za = wa * (uint64_t)pa + wb * (uint64_t)pb
 223                         + m[k] * (uint64_t)fa + (uint64_t)cca;
 224                 zb = wa * (uint64_t)qa + wb * (uint64_t)qb
 225                         + m[k] * (uint64_t)fb + (uint64_t)ccb;
 226                 if (k > 0) {
 227                         a[k - 1] = (uint32_t)za & 0x7FFFFFFF;
 228                         b[k - 1] = (uint32_t)zb & 0x7FFFFFFF;
 229                 }
 230
 231 #define M   ((uint64_t)1 << 32)
 232                 tta = za >> 31;
 233                 ttb = zb >> 31;
 234                 tta = (tta ^ M) - M;
 235                 ttb = (ttb ^ M) - M;
 236                 cca = *(int64_t *)&tta;
 237                 ccb = *(int64_t *)&ttb;
 238 #undef M
 239         }
 240         a[len - 1] = (uint32_t)cca;
 241         b[len - 1] = (uint32_t)ccb;
 242
 243         /*
 244          * At this point:
 245          *   -m <= a < 2*m
 246          *   -m <= b < 2*m
 247          * (this is a case of Montgomery reduction)
 248          * The top word of 'a' and 'b' may have a 32-th bit set.
 249          * We may have to add or subtract the modulus.
 250          */
 251         finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
 252         finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
 253 }
 254
 255 /* see inner.h */
 256 uint32_t
 257 br_i31_moddiv(uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i,
 258         uint32_t *t)
 259 {
 260         /*
 261          * Algorithm is an extended binary GCD. We maintain four values
 262          * a, b, u and v, with the following invariants:
 263          *
 264          *   a * x = y * u mod m
 265          *   b * x = y * v mod m
 266          *
 267          * Starting values are:
 268          *
 269          *   a = y
 270          *   b = m
 271          *   u = x
 272          *   v = 0
 273          *
 274          * The formal definition of the algorithm is a sequence of steps:
 275          *
 276          *   - If a is even, then a <- a/2 and u <- u/2 mod m.
 277          *   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
 278          *   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
 279          *   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
 280          *
 281          * Algorithm stops when a = b. At that point, they both are equal
 282          * to GCD(y,m); the modular division succeeds if that value is 1.
 283          * The result of the modular division is then u (or v: both are
 284          * equal at that point).
 285          *
 286          * Each step makes either a or b shrink by at least one bit; hence,
 287          * if m has bit length k bits, then 2k-2 steps are sufficient.
 288          *
 289          *
 290          * Though complexity is quadratic in the size of m, the bit-by-bit
 291          * processing is not very efficient. We can speed up processing by
 292          * remarking that the decisions are taken based only on observation
 293          * of the top and low bits of a and b.
 294          *
 295          * In the loop below, at each iteration, we use the two top words
 296          * of a and b, and the low words of a and b, to compute reduction
 297          * parameters pa, pb, qa and qb such that the new values for a
 298          * and b are:
 299          *
 300          *   a' = (a*pa + b*pb) / (2^31)
 301          *   b' = (a*qa + b*qb) / (2^31)
 302          *
 303          * the division being exact.
 304          *
 305          * Since the choices are based on the top words, they may be slightly
 306          * off, requiring an optional correction: if a' < 0, then we replace
 307          * pa with -pa, and pb with -pb. The total length of a and b is
 308          * thus reduced by at least 30 bits at each iteration.
 309          *
 310          * The stopping conditions are still the same, though: when a
 311          * and b become equal, they must be both odd (since m is odd,
 312          * the GCD cannot be even), therefore the next operation is a
 313          * subtraction, and one of the values becomes 0. At that point,
 314          * nothing else happens, i.e. one value is stuck at 0, and the
 315          * other one is the GCD.
 316          */
 317         size_t len, k;
 318         uint32_t *a, *b, *u, *v;
 319         uint32_t num, r;
 320
 321         len = (m[0] + 31) >> 5;
 322         a = t;
 323         b = a + len;
 324         u = x + 1;
 325         v = b + len;
 326         memcpy(a, y + 1, len * sizeof *y);
 327         memcpy(b, m + 1, len * sizeof *m);
 328         memset(v, 0, len * sizeof *v);
 329
 330         /*
 331          * Loop below ensures that a and b are reduced by some bits each,
 332          * for a total of at least 30 bits.
 333          */
 334         for (num = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) {
 335                 size_t j;
 336                 uint32_t c0, c1;
 337                 uint32_t a0, a1, b0, b1;
 338                 uint64_t a_hi, b_hi;
 339                 uint32_t a_lo, b_lo;
 340                 int64_t pa, pb, qa, qb;
 341                 int i;
 342
 343                 /*
 344                  * Extract top words of a and b. If j is the highest
 345                  * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
 346                  * (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].
 347                  * If a and b are down to one word each, then we use a[0]
 348                  * and b[0].
 349                  */
 350                 c0 = (uint32_t)-1;
 351                 c1 = (uint32_t)-1;
 352                 a0 = 0;
 353                 a1 = 0;
 354                 b0 = 0;
 355                 b1 = 0;
 356                 j = len;
 357                 while (j -- > 0) {
 358                         uint32_t aw, bw;
 359
 360                         aw = a[j];
 361                         bw = b[j];
 362                         a0 ^= (a0 ^ aw) & c0;
 363                         a1 ^= (a1 ^ aw) & c1;
 364                         b0 ^= (b0 ^ bw) & c0;
 365                         b1 ^= (b1 ^ bw) & c1;
 366                         c1 = c0;
 367                         c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
 368                 }
 369
 370                 /*
 371                  * If c1 = 0, then we grabbed two words for a and b.
 372                  * If c1 != 0 but c0 = 0, then we grabbed one word. It
 373                  * is not possible that c1 != 0 and c0 != 0, because that
 374                  * would mean that both integers are zero.
 375                  */
 376                 a1 |= a0 & c1;
 377                 a0 &= ~c1;
 378                 b1 |= b0 & c1;
 379                 b0 &= ~c1;
 380                 a_hi = ((uint64_t)a0 << 31) + a1;
 381                 b_hi = ((uint64_t)b0 << 31) + b1;
 382                 a_lo = a[0];
 383                 b_lo = b[0];
 384
 385                 /*
 386                  * Compute reduction factors:
 387                  *
 388                  *   a' = a*pa + b*pb
 389                  *   b' = a*qa + b*qb
 390                  *
 391                  * such that a' and b' are both multiple of 2^31, but are
 392                  * only marginally larger than a and b.
 393                  */
 394                 pa = 1;
 395                 pb = 0;
 396                 qa = 0;
 397                 qb = 1;
 398                 for (i = 0; i < 31; i ++) {
 399                         /*
 400                          * At each iteration:
 401                          *
 402                          *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
 403                          *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
 404                          *   a <- a/2 if: a is even
 405                          *   b <- b/2 if: a is odd, b is even
 406                          *
 407                          * We multiply a_lo and b_lo by 2 at each
 408                          * iteration, thus a division by 2 really is a
 409                          * non-multiplication by 2.
 410                          */
 411                         uint32_t r, oa, ob, cAB, cBA, cA;
 412                         uint64_t rz;
 413
 414                         /*
 415                          * r = GT(a_hi, b_hi)
 416                          * But the GT() function works on uint32_t operands,
 417                          * so we inline a 64-bit version here.
 418                          */
 419                         rz = b_hi - a_hi;
 420                         r = (uint32_t)((rz ^ ((a_hi ^ b_hi)
 421                                 & (a_hi ^ rz))) >> 63);
 422
 423                         /*
 424                          * cAB = 1 if b must be subtracted from a
 425                          * cBA = 1 if a must be subtracted from b
 426                          * cA = 1 if a is divided by 2, 0 otherwise
 427                          *
 428                          * Rules:
 429                          *
 430                          *   cAB and cBA cannot be both 1.
 431                          *   if a is not divided by 2, b is.
 432                          */
 433                         oa = (a_lo >> i) & 1;
 434                         ob = (b_lo >> i) & 1;
 435                         cAB = oa & ob & r;
 436                         cBA = oa & ob & NOT(r);
 437                         cA = cAB | NOT(oa);
 438
 439                         /*
 440                          * Conditional subtractions.
 441                          */
 442                         a_lo -= b_lo & -cAB;
 443                         a_hi -= b_hi & -(uint64_t)cAB;
 444                         pa -= qa & -(int64_t)cAB;
 445                         pb -= qb & -(int64_t)cAB;
 446                         b_lo -= a_lo & -cBA;
 447                         b_hi -= a_hi & -(uint64_t)cBA;
 448                         qa -= pa & -(int64_t)cBA;
 449                         qb -= pb & -(int64_t)cBA;
 450
 451                         /*
 452                          * Shifting.
 453                          */
 454                         a_lo += a_lo & (cA - 1);
 455                         pa += pa & ((int64_t)cA - 1);
 456                         pb += pb & ((int64_t)cA - 1);
 457                         a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
 458                         b_lo += b_lo & -cA;
 459                         qa += qa & -(int64_t)cA;
 460                         qb += qb & -(int64_t)cA;
 461                         b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
 462                 }
 463
 464                 /*
 465                  * Replace a and b with new values a' and b'.
 466                  */
 467                 r = co_reduce(a, b, len, pa, pb, qa, qb);
 468                 pa -= pa * ((r & 1) << 1);
 469                 pb -= pb * ((r & 1) << 1);
 470                 qa -= qa * (r & 2);
 471                 qb -= qb * (r & 2);
 472                 co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
 473         }
 474
 475         /*
 476          * Now one of the arrays should be 0, and the other contains
 477          * the GCD. If a is 0, then u is 0 as well, and v contains
 478          * the division result.
 479          * Result is correct if and only if GCD is 1.
 480          */
 481         r = (a[0] | b[0]) ^ 1;
 482         u[0] |= v[0];
 483         for (k = 1; k < len; k ++) {
 484                 r |= a[k] | b[k];
 485                 u[k] |= v[k];
 486         }
 487         return EQ0(r);
 488 }