contrib/bearssl/src/symcipher/des_ct.c

   1 /*
   2  * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * During key schedule, we need to apply bit extraction PC-2 then permute
  29  * things into our bitslice representation. PC-2 extracts 48 bits out
  30  * of two 28-bit words (kl and kr), and we store these bits into two
  31  * 32-bit words sk0 and sk1.
  32  *
  33  *  -- bit 16+x of sk0 comes from bit QL0[x] of kl
  34  *  -- bit x of sk0 comes from bit QR0[x] of kr
  35  *  -- bit 16+x of sk1 comes from bit QL1[x] of kl
  36  *  -- bit x of sk1 comes from bit QR1[x] of kr
  37  */
  38
  39 static const unsigned char QL0[] = {
  40         17,  4, 27, 23, 13, 22,  7, 18,
  41         16, 24,  2, 20,  1,  8, 15, 26
  42 };
  43
  44 static const unsigned char QR0[] = {
  45         25, 19,  9,  1,  5, 11, 23,  8,
  46         17,  0, 22,  3,  6, 20, 27, 24
  47 };
  48
  49 static const unsigned char QL1[] = {
  50         28, 28, 14, 11, 28, 28, 25,  0,
  51         28, 28,  5,  9, 28, 28, 12, 21
  52 };
  53
  54 static const unsigned char QR1[] = {
  55         28, 28, 15,  4, 28, 28, 26, 16,
  56         28, 28, 12,  7, 28, 28, 10, 14
  57 };
  58
  59 /*
  60  * 32-bit rotation. The C compiler is supposed to recognize it as a
  61  * rotation and use the local architecture rotation opcode (if available).
  62  */
  63 static inline uint32_t
  64 rotl(uint32_t x, int n)
  65 {
  66         return (x << n) | (x >> (32 - n));
  67 }
  68
  69 /*
  70  * Compute key schedule for 8 key bytes (produces 32 subkey words).
  71  */
  72 static void
  73 keysched_unit(uint32_t *skey, const void *key)
  74 {
  75         int i;
  76
  77         br_des_keysched_unit(skey, key);
  78
  79         /*
  80          * Apply PC-2 + bitslicing.
  81          */
  82         for (i = 0; i < 16; i ++) {
  83                 uint32_t kl, kr, sk0, sk1;
  84                 int j;
  85
  86                 kl = skey[(i << 1) + 0];
  87                 kr = skey[(i << 1) + 1];
  88                 sk0 = 0;
  89                 sk1 = 0;
  90                 for (j = 0; j < 16; j ++) {
  91                         sk0 <<= 1;
  92                         sk1 <<= 1;
  93                         sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16;
  94                         sk0 |= (kr >> QR0[j]) & (uint32_t)1;
  95                         sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16;
  96                         sk1 |= (kr >> QR1[j]) & (uint32_t)1;
  97                 }
  98
  99                 skey[(i << 1) + 0] = sk0;
 100                 skey[(i << 1) + 1] = sk1;
 101         }
 102
 103 #if 0
 104                 /*
 105                  * Speed-optimized version for PC-2 + bitslicing.
 106                  * (Unused. Kept for reference only.)
 107                  */
 108                 sk0 = kl & (uint32_t)0x00100000;
 109                 sk0 |= (kl & (uint32_t)0x08008000) << 2;
 110                 sk0 |= (kl & (uint32_t)0x00400000) << 4;
 111                 sk0 |= (kl & (uint32_t)0x00800000) << 5;
 112                 sk0 |= (kl & (uint32_t)0x00040000) << 6;
 113                 sk0 |= (kl & (uint32_t)0x00010000) << 7;
 114                 sk0 |= (kl & (uint32_t)0x00000100) << 10;
 115                 sk0 |= (kl & (uint32_t)0x00022000) << 14;
 116                 sk0 |= (kl & (uint32_t)0x00000082) << 18;
 117                 sk0 |= (kl & (uint32_t)0x00000004) << 19;
 118                 sk0 |= (kl & (uint32_t)0x04000000) >> 10;
 119                 sk0 |= (kl & (uint32_t)0x00000010) << 26;
 120                 sk0 |= (kl & (uint32_t)0x01000000) >> 2;
 121
 122                 sk0 |= kr & (uint32_t)0x00000100;
 123                 sk0 |= (kr & (uint32_t)0x00000008) << 1;
 124                 sk0 |= (kr & (uint32_t)0x00000200) << 4;
 125                 sk0 |= rotl(kr & (uint32_t)0x08000021, 6);
 126                 sk0 |= (kr & (uint32_t)0x01000000) >> 24;
 127                 sk0 |= (kr & (uint32_t)0x00000002) << 11;
 128                 sk0 |= (kr & (uint32_t)0x00100000) >> 18;
 129                 sk0 |= (kr & (uint32_t)0x00400000) >> 17;
 130                 sk0 |= (kr & (uint32_t)0x00800000) >> 14;
 131                 sk0 |= (kr & (uint32_t)0x02020000) >> 10;
 132                 sk0 |= (kr & (uint32_t)0x00080000) >> 5;
 133                 sk0 |= (kr & (uint32_t)0x00000040) >> 3;
 134                 sk0 |= (kr & (uint32_t)0x00000800) >> 1;
 135
 136                 sk1 = kl & (uint32_t)0x02000000;
 137                 sk1 |= (kl & (uint32_t)0x00001000) << 5;
 138                 sk1 |= (kl & (uint32_t)0x00000200) << 11;
 139                 sk1 |= (kl & (uint32_t)0x00004000) << 15;
 140                 sk1 |= (kl & (uint32_t)0x00000020) << 16;
 141                 sk1 |= (kl & (uint32_t)0x00000800) << 17;
 142                 sk1 |= (kl & (uint32_t)0x00000001) << 24;
 143                 sk1 |= (kl & (uint32_t)0x00200000) >> 5;
 144
 145                 sk1 |= (kr & (uint32_t)0x00000010) << 8;
 146                 sk1 |= (kr & (uint32_t)0x04000000) >> 17;
 147                 sk1 |= (kr & (uint32_t)0x00004000) >> 14;
 148                 sk1 |= (kr & (uint32_t)0x00000400) >> 9;
 149                 sk1 |= (kr & (uint32_t)0x00010000) >> 8;
 150                 sk1 |= (kr & (uint32_t)0x00001000) >> 7;
 151                 sk1 |= (kr & (uint32_t)0x00000080) >> 3;
 152                 sk1 |= (kr & (uint32_t)0x00008000) >> 2;
 153 #endif
 154 }
 155
 156 /* see inner.h */
 157 unsigned
 158 br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len)
 159 {
 160         switch (key_len) {
 161         case 8:
 162                 keysched_unit(skey, key);
 163                 return 1;
 164         case 16:
 165                 keysched_unit(skey, key);
 166                 keysched_unit(skey + 32, (const unsigned char *)key + 8);
 167                 br_des_rev_skey(skey + 32);
 168                 memcpy(skey + 64, skey, 32 * sizeof *skey);
 169                 return 3;
 170         default:
 171                 keysched_unit(skey, key);
 172                 keysched_unit(skey + 32, (const unsigned char *)key + 8);
 173                 br_des_rev_skey(skey + 32);
 174                 keysched_unit(skey + 64, (const unsigned char *)key + 16);
 175                 return 3;
 176         }
 177 }
 178
 179 /*
 180  * DES confusion function. This function performs expansion E (32 to
 181  * 48 bits), XOR with subkey, S-boxes, and permutation P.
 182  */
 183 static inline uint32_t
 184 Fconf(uint32_t r0, const uint32_t *sk)
 185 {
 186         /*
 187          * Each 6->4 S-box is virtually turned into four 6->1 boxes; we
 188          * thus end up with 32 boxes that we call "T-boxes" here. We will
 189          * evaluate them with bitslice code.
 190          *
 191          * Each T-box is a circuit of multiplexers (sort of) and thus
 192          * takes 70 inputs: the 6 actual T-box inputs, and 64 constants
 193          * that describe the T-box output for all combinations of the
 194          * 6 inputs. With this model, all T-boxes are identical (with
 195          * distinct inputs) and thus can be executed in parallel with
 196          * bitslice code.
 197          *
 198          * T-boxes are numbered from 0 to 31, in least-to-most
 199          * significant order. Thus, S-box S1 corresponds to T-boxes 31,
 200          * 30, 29 and 28, in that order. T-box 'n' is computed with the
 201          * bits at rank 'n' in the 32-bit words.
 202          *
 203          * Words x0 to x5 contain the T-box inputs 0 to 5.
 204          */
 205         uint32_t x0, x1, x2, x3, x4, x5, z0;
 206         uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
 207         uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
 208         uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29;
 209         uint32_t y30;
 210
 211         /*
 212          * Spread input bits over the 6 input words x*.
 213          */
 214         x1 = r0 & (uint32_t)0x11111111;
 215         x2 = (r0 >> 1) & (uint32_t)0x11111111;
 216         x3 = (r0 >> 2) & (uint32_t)0x11111111;
 217         x4 = (r0 >> 3) & (uint32_t)0x11111111;
 218         x1 = (x1 << 4) - x1;
 219         x2 = (x2 << 4) - x2;
 220         x3 = (x3 << 4) - x3;
 221         x4 = (x4 << 4) - x4;
 222         x0 = (x4 << 4) | (x4 >> 28);
 223         x5 = (x1 >> 4) | (x1 << 28);
 224
 225         /*
 226          * XOR with the subkey for this round.
 227          */
 228         x0 ^= sk[0];
 229         x1 ^= sk[1];
 230         x2 ^= sk[2];
 231         x3 ^= sk[3];
 232         x4 ^= sk[4];
 233         x5 ^= sk[5];
 234
 235         /*
 236          * The T-boxes are done in parallel, since they all use a
 237          * "tree of multiplexer". We use "fake multiplexers":
 238          *
 239          *   y = a ^ (x & b)
 240          *
 241          * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).
 242          */
 243         y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C);
 244         y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821);
 245         y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809);
 246         y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28);
 247         y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820);
 248         y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801);
 249         y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929);
 250         y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1);
 251         y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A);
 252         y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B);
 253         y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A);
 254         y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572);
 255         y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083);
 256         y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000);
 257         y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA);
 258         /* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
 259         y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9);
 260         y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE);
 261         y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816);
 262         y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B);
 263         y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88);
 264         y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25);
 265         y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D);
 266         y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0);
 267         y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100);
 268         y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011);
 269         y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959);
 270         y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A);
 271         y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8);
 272         y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B);
 273         y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD);
 274         /* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
 275
 276         y0 = y0 ^ (x1 & y1);
 277         y1 = y2 ^ (x1 & y3);
 278         y2 = y4 ^ (x1 & y5);
 279         y3 = y6 ^ (x1 & y7);
 280         y4 = y8 ^ (x1 & y9);
 281         y5 = y10 ^ (x1 & y11);
 282         y6 = y12 ^ (x1 & y13);
 283         y7 = y14; /* was: y14 ^ (x1 & y15) */
 284         y8 = y16 ^ (x1 & y17);
 285         y9 = y18 ^ (x1 & y19);
 286         y10 = y20 ^ (x1 & y21);
 287         y11 = y22 ^ (x1 & y23);
 288         y12 = y24 ^ (x1 & y25);
 289         y13 = y26 ^ (x1 & y27);
 290         y14 = y28 ^ (x1 & y29);
 291         y15 = y30; /* was: y30 ^ (x1 & y31) */
 292
 293         y0 = y0 ^ (x2 & y1);
 294         y1 = y2 ^ (x2 & y3);
 295         y2 = y4 ^ (x2 & y5);
 296         y3 = y6 ^ (x2 & y7);
 297         y4 = y8 ^ (x2 & y9);
 298         y5 = y10 ^ (x2 & y11);
 299         y6 = y12 ^ (x2 & y13);
 300         y7 = y14 ^ (x2 & y15);
 301
 302         y0 = y0 ^ (x3 & y1);
 303         y1 = y2 ^ (x3 & y3);
 304         y2 = y4 ^ (x3 & y5);
 305         y3 = y6 ^ (x3 & y7);
 306
 307         y0 = y0 ^ (x4 & y1);
 308         y1 = y2 ^ (x4 & y3);
 309
 310         y0 = y0 ^ (x5 & y1);
 311
 312         /*
 313          * The P permutation:
 314          * -- Each bit move is converted into a mask + left rotation.
 315          * -- Rotations that use the same movement are coalesced together.
 316          * -- Left and right shifts are used as alternatives to a rotation
 317          * where appropriate (this will help architectures that do not have
 318          * a rotation opcode).
 319          */
 320         z0 = (y0 & (uint32_t)0x00000004) << 3;
 321         z0 |= (y0 & (uint32_t)0x00004000) << 4;
 322         z0 |= rotl(y0 & 0x12020120, 5);
 323         z0 |= (y0 & (uint32_t)0x00100000) << 6;
 324         z0 |= (y0 & (uint32_t)0x00008000) << 9;
 325         z0 |= (y0 & (uint32_t)0x04000000) >> 22;
 326         z0 |= (y0 & (uint32_t)0x00000001) << 11;
 327         z0 |= rotl(y0 & 0x20000200, 12);
 328         z0 |= (y0 & (uint32_t)0x00200000) >> 19;
 329         z0 |= (y0 & (uint32_t)0x00000040) << 14;
 330         z0 |= (y0 & (uint32_t)0x00010000) << 15;
 331         z0 |= (y0 & (uint32_t)0x00000002) << 16;
 332         z0 |= rotl(y0 & 0x40801800, 17);
 333         z0 |= (y0 & (uint32_t)0x00080000) >> 13;
 334         z0 |= (y0 & (uint32_t)0x00000010) << 21;
 335         z0 |= (y0 & (uint32_t)0x01000000) >> 10;
 336         z0 |= rotl(y0 & 0x88000008, 24);
 337         z0 |= (y0 & (uint32_t)0x00000480) >> 7;
 338         z0 |= (y0 & (uint32_t)0x00442000) >> 6;
 339         return z0;
 340 }
 341
 342 /*
 343  * Process one block through 16 successive rounds, omitting the swap
 344  * in the final round.
 345  */
 346 static void
 347 process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp)
 348 {
 349         int i;
 350         uint32_t l, r;
 351
 352         l = *pl;
 353         r = *pr;
 354         for (i = 0; i < 16; i ++) {
 355                 uint32_t t;
 356
 357                 t = l ^ Fconf(r, sk_exp);
 358                 l = r;
 359                 r = t;
 360                 sk_exp += 6;
 361         }
 362         *pl = r;
 363         *pr = l;
 364 }
 365
 366 /* see inner.h */
 367 void
 368 br_des_ct_process_block(unsigned num_rounds,
 369         const uint32_t *sk_exp, void *block)
 370 {
 371         unsigned char *buf;
 372         uint32_t l, r;
 373
 374         buf = block;
 375         l = br_dec32be(buf);
 376         r = br_dec32be(buf + 4);
 377         br_des_do_IP(&l, &r);
 378         while (num_rounds -- > 0) {
 379                 process_block_unit(&l, &r, sk_exp);
 380                 sk_exp += 96;
 381         }
 382         br_des_do_invIP(&l, &r);
 383         br_enc32be(buf, l);
 384         br_enc32be(buf + 4, r);
 385 }
 386
 387 /* see inner.h */
 388 void
 389 br_des_ct_skey_expand(uint32_t *sk_exp,
 390         unsigned num_rounds, const uint32_t *skey)
 391 {
 392         num_rounds <<= 4;
 393         while (num_rounds -- > 0) {
 394                 uint32_t v, w0, w1, w2, w3;
 395
 396                 v = *skey ++;
 397                 w0 = v & 0x11111111;
 398                 w1 = (v >> 1) & 0x11111111;
 399                 w2 = (v >> 2) & 0x11111111;
 400                 w3 = (v >> 3) & 0x11111111;
 401                 *sk_exp ++ = (w0 << 4) - w0;
 402                 *sk_exp ++ = (w1 << 4) - w1;
 403                 *sk_exp ++ = (w2 << 4) - w2;
 404                 *sk_exp ++ = (w3 << 4) - w3;
 405                 v = *skey ++;
 406                 w0 = v & 0x11111111;
 407                 w1 = (v >> 1) & 0x11111111;
 408                 *sk_exp ++ = (w0 << 4) - w0;
 409                 *sk_exp ++ = (w1 << 4) - w1;
 410         }
 411 }