contrib/bearssl/src/hash/ghash_pwr8.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #define BR_POWER_ASM_MACROS   1
  26 #include "inner.h"
  27
  28 /*
  29  * This is the GHASH implementation that leverages the POWER8 opcodes.
  30  */
  31
  32 #if BR_POWER8
  33
  34 /*
  35  * Symbolic names for the vector registers (v0 to v8) used by the assembly.
  36  *   HB0 = 16 bytes of value 0
  37  *   HB1 = 16 bytes of value 1
  38  *   HB2 = 16 bytes of value 2
  39  *   HB6 = 16 bytes of value 6
  40  *   HB7 = 16 bytes of value 7
  41  *   TT0, TT1 and TT2 are temporaries (overwritten by SL_256/REDUCE_F128)
  42  *
  43  * BSW holds the pattern for byteswapping 32-bit words; it is set only
  44  * on little-endian systems. XBSW is the same register with the +32 offset
  45  * (8 + 32 = 40) used to address it with the VSX opcodes.
  46  */
  47 #define HB0     0
  48 #define HB1     1
  49 #define HB2     2
  50 #define HB6     3
  51 #define HB7     4
  52 #define TT0     5
  53 #define TT1     6
  54 #define TT2     7
  55
  56 #define BSW     8
  57 #define XBSW   40
  58
  59 /*
  60  * Initialise constants HB0, HB1, HB2, HB6, HB7 and (via INIT_BSW) BSW.
  61  */
  62 #define INIT \
  63                 vxor(HB0, HB0, HB0) \
  64                 vspltisb(HB1, 1) \
  65                 vspltisb(HB2, 2) \
  66                 vspltisb(HB6, 6) \
  67                 vspltisb(HB7, 7) \
  68                 INIT_BSW
  69
  70 /*
  71  * Fix endianness of a value after reading it or before writing it. Both
  72  * macros are empty on big-endian; INIT_BSW needs the [idx2be] asm operand.
  73  */
  74 #if BR_POWER8_LE
  75 #define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
  76 #define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
  77 #else
  78 #define INIT_BSW
  79 #define FIX_ENDIAN(xx)
  80 #endif
  81
  82 /*
  83  * Shift the 256-bit value x0:x1 left by one bit, carrying the top bit of
  84  * x1 into x0 (uses HB0, HB1, HB7; clobbers TT0). This corrective step is
  85  * needed because GHASH is specified in full little-endian convention while
  86  * the opcodes are full big-endian, so the 255-bit product sits one bit low.
  87  */
  88 #define SL_256(x0, x1) \
  89                 vsldoi(TT0, HB0, x1, 1) \
  90                 vsl(x0, x0, HB1) \
  91                 vsr(TT0, TT0, HB7) \
  92                 vsl(x1, x1, HB1) \
  93                 vxor(x0, x0, TT0)
  94
  95 /*
  96  * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
  97  * x0 or x1, or a different register). x0, x1, TT0, TT1, TT2 are modified.
  98  */
  99 #define REDUCE_F128(xd, x0, x1) \
 100                 vxor(x0, x0, x1) \
 101                 vsr(TT0, x1, HB1) \
 102                 vsr(TT1, x1, HB2) \
 103                 vsr(TT2, x1, HB7) \
 104                 vxor(x0, x0, TT0) \
 105                 vxor(TT1, TT1, TT2) \
 106                 vxor(x0, x0, TT1) \
 107                 vsldoi(x1, x1, HB0, 15) \
 108                 vsl(TT1, x1, HB6) \
 109                 vsl(TT2, x1, HB1) \
 110                 vxor(x1, TT1, TT2) \
 111                 vsr(TT0, x1, HB1) \
 112                 vsr(TT1, x1, HB2) \
 113                 vsr(TT2, x1, HB7) \
 114                 vxor(x0, x0, x1) \
 115                 vxor(x0, x0, TT0) \
 116                 vxor(TT1, TT1, TT2) \
 117                 vxor(xd, x0, TT1)
 118
 119 /* see bearssl_hash.h */
 120 void
 121 br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
 122 {
 123         const unsigned char *buf1, *buf2;
 124         size_t num4, num1;
 125         unsigned char tmp[64];
 126         long cc0, cc1, cc2, cc3;
 127
 128 #if BR_POWER8_LE
 129         static const uint32_t idx2be[] = {
 130                 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
 131         };
 132 #endif
 133
 134         buf1 = data;
 135
 136         /*
 137          * Assembly code requires data into two chunks; first chunk
 138          * must contain a number of blocks which is a multiple of 4.
 139          * Since the processing for the first chunk is faster, we want
 140          * to make it as big as possible.
 141          *
 142          * For the remainder, there are two possibilities:
 143          *  -- if the remainder size is a multiple of 16, then use it
 144          *     in place;
 145          *  -- otherwise, copy it to the tmp[] array and pad it with
 146          *     zeros.
 147          */
 148         num4 = len >> 6;
 149         buf2 = buf1 + (num4 << 6);
 150         len &= 63;
 151         num1 = (len + 15) >> 4;
 152         if ((len & 15) != 0) {
 153                 memcpy(tmp, buf2, len);
 154                 memset(tmp + len, 0, (num1 << 4) - len);
 155                 buf2 = tmp;
 156         }
 157
 158         cc0 =  0;
 159         cc1 = 16;
 160         cc2 = 32;
 161         cc3 = 48;
 162         asm volatile (
 163                 INIT
 164
 165                 /*
 166                  * Load current h (denoted hereafter h1) in v9.
 167                  */
 168                 lxvw4x(41, 0, %[h])
 169                 FIX_ENDIAN(9)
 170
 171                 /*
 172                  * Load current y into v28.
 173                  */
 174                 lxvw4x(60, 0, %[y])
 175                 FIX_ENDIAN(28)
 176
 177                 /*
 178                  * Split h1 into three registers:
 179                  *   v17 = h1_1:h1_0
 180                  *   v18 =    0:h1_0
 181                  *   v19 = h1_1:0
 182                  */
 183                 xxpermdi(49, 41, 41, 2)
 184                 vsldoi(18, HB0, 9, 8)
 185                 vsldoi(19, 9, HB0, 8)
 186
 187                 /*
 188                  * If num4 is 0, skip directly to the second chunk.
 189                  */
 190                 cmpldi(%[num4], 0)
 191                 beq(chunk1)
 192
 193                 /*
 194                  * Compute h2 = h*h in v10.
 195                  */
 196                 vpmsumd(10, 18, 18)
 197                 vpmsumd(11, 19, 19)
 198                 SL_256(10, 11)
 199                 REDUCE_F128(10, 10, 11)
 200
 201                 /*
 202                  * Compute h3 = h*h*h in v11.
 203                  * We first split h2 into:
 204                  *   v10 = h2_0:h2_1
 205                  *   v11 =    0:h2_0
 206                  *   v12 = h2_1:0
 207                  * Then we do the product with h1, and reduce into v11.
 208                  */
 209                 vsldoi(11, HB0, 10, 8)
 210                 vsldoi(12, 10, HB0, 8)
 211                 vpmsumd(13, 10, 17)
 212                 vpmsumd(11, 11, 18)
 213                 vpmsumd(12, 12, 19)
 214                 vsldoi(14, HB0, 13, 8)
 215                 vsldoi(15, 13, HB0, 8)
 216                 vxor(11, 11, 14)
 217                 vxor(12, 12, 15)
 218                 SL_256(11, 12)
 219                 REDUCE_F128(11, 11, 12)
 220
 221                 /*
 222                  * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
 223                  */
 224                 vsldoi(12, HB0, 10, 8)
 225                 vsldoi(13, 10, HB0, 8)
 226                 vpmsumd(12, 12, 12)
 227                 vpmsumd(13, 13, 13)
 228                 SL_256(12, 13)
 229                 REDUCE_F128(12, 12, 13)
 230
 231                 /*
 232                  * Repack h1, h2, h3 and h4:
 233                  *   v13 = h4_0:h3_0
 234                  *   v14 = h4_1:h3_1
 235                  *   v15 = h2_0:h1_0
 236                  *   v16 = h2_1:h1_1
 237                  */
 238                 xxpermdi(45, 44, 43, 0)
 239                 xxpermdi(46, 44, 43, 3)
 240                 xxpermdi(47, 42, 41, 0)
 241                 xxpermdi(48, 42, 41, 3)
 242
 243                 /*
 244                  * Loop for each group of four blocks.
 245                  */
 246                 mtctr(%[num4])
 247         label(loop4)
 248                 /*
 249                  * Read the four next blocks.
 250                  *   v20 = y + a0 = b0
 251                  *   v21 = a1     = b1
 252                  *   v22 = a2     = b2
 253                  *   v23 = a3     = b3
 254                  */
 255                 lxvw4x(52, %[cc0], %[buf1])
 256                 lxvw4x(53, %[cc1], %[buf1])
 257                 lxvw4x(54, %[cc2], %[buf1])
 258                 lxvw4x(55, %[cc3], %[buf1])
 259                 FIX_ENDIAN(20)
 260                 FIX_ENDIAN(21)
 261                 FIX_ENDIAN(22)
 262                 FIX_ENDIAN(23)
 263                 addi(%[buf1], %[buf1], 64)
 264                 vxor(20, 20, 28)
 265
 266                 /*
 267                  * Repack the blocks into v9, v10, v11 and v12.
 268                  *   v9  = b0_0:b1_0
 269                  *   v10 = b0_1:b1_1
 270                  *   v11 = b2_0:b3_0
 271                  *   v12 = b2_1:b3_1
 272                  */
 273                 xxpermdi(41, 52, 53, 0)
 274                 xxpermdi(42, 52, 53, 3)
 275                 xxpermdi(43, 54, 55, 0)
 276                 xxpermdi(44, 54, 55, 3)
 277
 278                 /*
 279                  * Compute the products.
 280                  *   v20 = b0_0*h4_0 + b1_0*h3_0
 281                  *   v21 = b0_1*h4_0 + b1_1*h3_0
 282                  *   v22 = b0_0*h4_1 + b1_0*h3_1
 283                  *   v23 = b0_1*h4_1 + b1_1*h3_1
 284                  *   v24 = b2_0*h2_0 + b3_0*h1_0
 285                  *   v25 = b2_1*h2_0 + b3_1*h1_0
 286                  *   v26 = b2_0*h2_1 + b3_0*h1_1
 287                  *   v27 = b2_1*h2_1 + b3_1*h1_1
 288                  */
 289                 vpmsumd(20, 13,  9)
 290                 vpmsumd(21, 13, 10)
 291                 vpmsumd(22, 14,  9)
 292                 vpmsumd(23, 14, 10)
 293                 vpmsumd(24, 15, 11)
 294                 vpmsumd(25, 15, 12)
 295                 vpmsumd(26, 16, 11)
 296                 vpmsumd(27, 16, 12)
 297
 298                 /*
 299                  * Sum products into a single 256-bit result in v11:v12.
 300                  */
 301                 vxor(11, 20, 24)
 302                 vxor(12, 23, 27)
 303                 vxor( 9, 21, 22)
 304                 vxor(10, 25, 26)
 305                 vxor(20,  9, 10)
 306                 vsldoi( 9, HB0, 20, 8)
 307                 vsldoi(10, 20, HB0, 8)
 308                 vxor(11, 11, 9)
 309                 vxor(12, 12, 10)
 310
 311                 /*
 312                  * Fix and reduce in GF(2^128); this is the new y (in v28).
 313                  */
 314                 SL_256(11, 12)
 315                 REDUCE_F128(28, 11, 12)
 316
 317                 /*
 318                  * Loop for next group of four blocks.
 319                  */
 320                 bdnz(loop4)
 321
 322                 /*
 323                  * Process second chunk, one block at a time.
 324                  */
 325         label(chunk1)
 326                 cmpldi(%[num1], 0)
 327                 beq(done)
 328
 329                 mtctr(%[num1])
 330         label(loop1)
 331                 /*
 332                  * Load next data block and XOR it into y.
 333                  */
 334                 lxvw4x(41, 0, %[buf2])
 335 #if BR_POWER8_LE
 336                 FIX_ENDIAN(9)
 337 #endif
 338                 addi(%[buf2], %[buf2], 16)
 339                 vxor(9, 28, 9)
 340
 341                 /*
 342                  * Split y into doublewords:
 343                  *   v9  = y_0:y_1
 344                  *   v10 =   0:y_0
 345                  *   v11 = y_1:0
 346                  */
 347                 vsldoi(10, HB0, 9, 8)
 348                 vsldoi(11, 9, HB0, 8)
 349
 350                 /*
 351                  * Compute products with h:
 352                  *   v12 = y_0 * h_0
 353                  *   v13 = y_1 * h_1
 354                  *   v14 = y_1 * h_0 + y_0 * h_1
 355                  */
 356                 vpmsumd(14,  9, 17)
 357                 vpmsumd(12, 10, 18)
 358                 vpmsumd(13, 11, 19)
 359
 360                 /*
 361                  * Propagate v14 into v12:v13 to finalise product.
 362                  */
 363                 vsldoi(10, HB0, 14, 8)
 364                 vsldoi(11, 14, HB0, 8)
 365                 vxor(12, 12, 10)
 366                 vxor(13, 13, 11)
 367
 368                 /*
 369                  * Fix result and reduce into v28 (next value for y).
 370                  */
 371                 SL_256(12, 13)
 372                 REDUCE_F128(28, 12, 13)
 373                 bdnz(loop1)
 374
 375         label(done)
 376                 /*
 377                  * Write back the new y.
 378                  */
 379                 FIX_ENDIAN(28)
 380                 stxvw4x(60, 0, %[y])
 381
 382 : [buf1] "+b" (buf1), [buf2] "+b" (buf2)
 383 : [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
 384   [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
 385 #if BR_POWER8_LE
 386         , [idx2be] "b" (idx2be)
 387 #endif
 388 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
 389   "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
 390   "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
 391   "ctr", "memory"
 392         );
 393 }
 394
 395 /* see bearssl_hash.h */
 396 br_ghash
 397 br_ghash_pwr8_get(void)
 398 {
 399         return &br_ghash_pwr8;
 400 }
 401
 402 #else
 403
 404 /* see bearssl_hash.h */
 405 br_ghash
 406 br_ghash_pwr8_get(void)
 407 {
 408         return 0;
 409 }
 410
 411 #endif