2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #define BR_POWER_ASM_MACROS 1
29 * This is the GHASH implementation that leverages the POWER8 opcodes.
35 * Some symbolic names for registers.
36 * HB0 = 16 bytes of value 0
37 * HB1 = 16 bytes of value 1
38 * HB2 = 16 bytes of value 2
39 * HB6 = 16 bytes of value 6
40 * HB7 = 16 bytes of value 7
41 * TT0, TT1 and TT2 are temporaries
43 * BSW holds the pattern for byteswapping 32-bit words; this is set only
44 * on little-endian systems. XBSW is the same register with the +32 offset
45 * for access with the VSX opcodes.
60 * Macro to initialise the constants.
71 * Fix endianness of a value after reading it or before writing it, if
/* Load the 32-bit-word byteswap pattern stored in idx2be into BSW (addressed through its VSX alias XBSW). */
75 #define INIT_BSW lxvw4x(XBSW, 0, %[idx2be])
/* Permute register xx with itself under the BSW pattern, fixing its endianness in place. */
76 #define FIX_ENDIAN(xx) vperm(xx, xx, xx, BSW)
/* Empty variant of FIX_ENDIAN: expands to nothing, so the value is left as is. NOTE(review): presumably selected on big-endian builds, where no byteswap is needed; the selecting conditional is not visible here -- confirm. */
79 #define FIX_ENDIAN(xx)
83 * Shift x0:x1 to the left by one bit. This is a corrective action
84 * needed because GHASH is defined with a fully little-endian convention,
85 * while the opcodes use a fully big-endian convention, so the 255-bit
86 * product ends up one bit to the right.
88 #define SL_256(x0, x1) \
89 vsldoi(TT0, HB0, x1, 1) \
96 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
97 * x0 or x1, or a different register). x0 and x1 are modified.
99 #define REDUCE_F128(xd, x0, x1) \
105 vxor(TT1, TT1, TT2) \
107 vsldoi(x1, x1, HB0, 15) \
116 vxor(TT1, TT1, TT2) \
119 /* see bearssl_hash.h */
121 br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
123 const unsigned char *buf1, *buf2;
125 unsigned char tmp[64];
126 long cc0, cc1, cc2, cc3;
129 static const uint32_t idx2be[] = {
130 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
137 * Assembly code requires data split into two chunks; first chunk
138 * must contain a number of blocks which is a multiple of 4.
139 * Since the processing for the first chunk is faster, we want
140 * to make it as big as possible.
142 * For the remainder, there are two possibilities:
143 * -- if the remainder size is a multiple of 16, then use it
145 * -- otherwise, copy it to the tmp[] array and pad it with
149 buf2 = buf1 + (num4 << 6);
151 num1 = (len + 15) >> 4;
152 if ((len & 15) != 0) {
153 memcpy(tmp, buf2, len);
154 memset(tmp + len, 0, (num1 << 4) - len);
166 * Load current h (denoted hereafter h1) in v9.
172 * Load current y into v28.
178 * Split h1 into three registers:
183 xxpermdi(49, 41, 41, 2)
184 vsldoi(18, HB0, 9, 8)
185 vsldoi(19, 9, HB0, 8)
188 * If num4 is 0, skip directly to the second chunk.
194 * Compute h2 = h*h in v10.
199 REDUCE_F128(10, 10, 11)
202 * Compute h3 = h*h*h in v11.
203 * We first split h2 into:
207 * Then we do the product with h1, and reduce into v11.
209 vsldoi(11, HB0, 10, 8)
210 vsldoi(12, 10, HB0, 8)
214 vsldoi(14, HB0, 13, 8)
215 vsldoi(15, 13, HB0, 8)
219 REDUCE_F128(11, 11, 12)
222 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
224 vsldoi(12, HB0, 10, 8)
225 vsldoi(13, 10, HB0, 8)
229 REDUCE_F128(12, 12, 13)
232 * Repack h1, h2, h3 and h4:
238 xxpermdi(45, 44, 43, 0)
239 xxpermdi(46, 44, 43, 3)
240 xxpermdi(47, 42, 41, 0)
241 xxpermdi(48, 42, 41, 3)
244 * Loop for each group of four blocks.
249 * Read the four next blocks.
255 lxvw4x(52, %[cc0], %[buf1])
256 lxvw4x(53, %[cc1], %[buf1])
257 lxvw4x(54, %[cc2], %[buf1])
258 lxvw4x(55, %[cc3], %[buf1])
263 addi(%[buf1], %[buf1], 64)
267 * Repack the blocks into v9, v10, v11 and v12.
273 xxpermdi(41, 52, 53, 0)
274 xxpermdi(42, 52, 53, 3)
275 xxpermdi(43, 54, 55, 0)
276 xxpermdi(44, 54, 55, 3)
279 * Compute the products.
280 * v20 = b0_0*h4_0 + b1_0*h3_0
281 * v21 = b0_1*h4_0 + b1_1*h3_0
282 * v22 = b0_0*h4_1 + b1_0*h3_1
283 * v23 = b0_1*h4_1 + b1_1*h3_1
284 * v24 = b2_0*h2_0 + b3_0*h1_0
285 * v25 = b2_1*h2_0 + b3_1*h1_0
286 * v26 = b2_0*h2_1 + b3_0*h1_1
287 * v27 = b2_1*h2_1 + b3_1*h1_1
299 * Sum products into a single 256-bit result in v11:v12.
306 vsldoi( 9, HB0, 20, 8)
307 vsldoi(10, 20, HB0, 8)
312 * Fix and reduce in GF(2^128); this is the new y (in v28).
315 REDUCE_F128(28, 11, 12)
318 * Loop for next group of four blocks.
323 * Process second chunk, one block at a time.
332 * Load next data block and XOR it into y.
334 lxvw4x(41, 0, %[buf2])
338 addi(%[buf2], %[buf2], 16)
342 * Split y into doublewords:
347 vsldoi(10, HB0, 9, 8)
348 vsldoi(11, 9, HB0, 8)
351 * Compute products with h:
354 * v14 = y_1 * h_0 + y_0 * h_1
361 * Propagate v14 into v12:v13 to finalise product.
363 vsldoi(10, HB0, 14, 8)
364 vsldoi(11, 14, HB0, 8)
369 * Fix result and reduce into v28 (next value for y).
372 REDUCE_F128(28, 12, 13)
377 * Write back the new y.
382 : [buf1] "+b" (buf1), [buf2] "+b" (buf2)
383 : [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
384 [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
386 , [idx2be] "b" (idx2be)
388 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
389 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
390 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
395 /* see bearssl_hash.h */
397 br_ghash_pwr8_get(void)
399 return &br_ghash_pwr8;
404 /* see bearssl_hash.h */
406 br_ghash_pwr8_get(void)