]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
Merge llvm-project 13.0.0 release
[FreeBSD/FreeBSD.git] / sys / contrib / openzfs / module / zfs / vdev_raidz_math_aarch64_neon_common.h
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
23  */
24
25 #include <sys/types.h>
26 #include <sys/simd.h>
27
/*
 * On Linux, every "__asm" statement in the macros below is rewritten to
 * "__asm__ __volatile__".
 */
28 #ifdef __linux__
29 #define __asm __asm__ __volatile__
30 #endif
31
/*
 * REG_CNT(...) expands to the number of comma-separated arguments it was
 * given, for 1 to 8 arguments.
 *
 * The argument list is followed by the descending sequence 8..1 and
 * _REG_CNT() returns whatever lands in the ninth position: each extra
 * argument pushes the sequence one slot to the right, so the ninth slot
 * holds the argument count.
 */
#define _REG_CNT(a0, a1, a2, a3, a4, a5, a6, a7, count, ...) count
#define REG_CNT(...) _REG_CNT(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1)
34
/*
 * VRn(r...) expands to the string literal "%[wREG]", where REG is the n-th
 * (0-based) token of the register list r.  Pasted into an asm template,
 * that string refers to the asm operand named wREG.
 *
 * VRn_() performs the positional selection.  VR2..VR7 append the
 * placeholder numbers 36, 35, ... to r so that the selector still finds an
 * argument when r holds fewer than n+1 registers.  VR0 and VR1 append
 * nothing, so r must hold at least two registers.
 *
 * VR(X) builds the reference for operand wX directly.
 */
35 #define VR0_(REG, ...) "%[w"#REG"]"
36 #define VR1_(_1, REG, ...) "%[w"#REG"]"
37 #define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
38 #define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
39 #define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
40 #define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
41 #define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
42 #define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
43
44 /*
45  * Here we need registers not used otherwise.
46  * They will be used in unused ASM for the case
47  * with more registers than required... but GCC
48  * will still need to make sure the constraints
49  * are correct, and duplicate constraints are illegal
50  * ... and we use the "register" number as a name
51  */
52
53 #define VR0(r...) VR0_(r)
54 #define VR1(r...) VR1_(r)
55 #define VR2(r...) VR2_(r, 36)
56 #define VR3(r...) VR3_(r, 36, 35)
57 #define VR4(r...) VR4_(r, 36, 35, 34, 33)
58 #define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
59 #define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
60 #define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
61
62 #define VR(X) "%[w"#X"]"
63
/*
 * RVRn(r...) expands to an asm *input* operand for the n-th (0-based)
 * token REG of the register list r:
 *
 *     [wREG] "w" (wREG)
 *
 * i.e. the C variable wREG is passed in under the operand name wREG with
 * the "w" register constraint.  RVR2..RVR7 append the placeholder numbers
 * 36, 35, ... so that a selection past the end of r still yields an
 * operand.  RVR(X) builds the input operand for wX directly.
 */
64 #define RVR0_(REG, ...) [w##REG] "w" (w##REG)
65 #define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
66 #define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
67 #define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
68 #define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
69 #define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
70 #define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
71 #define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)
72
73 #define RVR0(r...) RVR0_(r)
74 #define RVR1(r...) RVR1_(r)
75 #define RVR2(r...) RVR2_(r, 36)
76 #define RVR3(r...) RVR3_(r, 36, 35)
77 #define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
78 #define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
79 #define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
80 #define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)
81
82 #define RVR(X) [w##X] "w" (w##X)
83
/*
 * WVRn(r...) expands to an asm *output* operand for the n-th (0-based)
 * token REG of the register list r:
 *
 *     [wREG] "=w" (wREG)
 *
 * The "=" modifier marks the operand as write-only.  WVR2..WVR7 append
 * the placeholder numbers 36, 35, ... so that a selection past the end of
 * r still yields an operand.  WVR(X) builds the output operand for wX
 * directly.
 */
84 #define WVR0_(REG, ...) [w##REG] "=w" (w##REG)
85 #define WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
86 #define WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
87 #define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
88 #define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
89 #define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
90 #define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
91 #define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)
92
93 #define WVR0(r...) WVR0_(r)
94 #define WVR1(r...) WVR1_(r)
95 #define WVR2(r...) WVR2_(r, 36)
96 #define WVR3(r...) WVR3_(r, 36, 35)
97 #define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
98 #define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
99 #define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
100 #define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)
101
102 #define WVR(X) [w##X] "=w" (w##X)
103
/*
 * UVRn(r...) expands to an asm *read-write* operand for the n-th (0-based)
 * token REG of the register list r:
 *
 *     [wREG] "+&w" (wREG)
 *
 * "+" marks the operand as both read and written; "&" marks it as
 * early-clobber.  UVR2..UVR7 append the placeholder numbers 36, 35, ...
 * so that a selection past the end of r still yields an operand.
 * UVR(X) builds the read-write operand for wX directly.
 */
104 #define UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
105 #define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
106 #define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
107 #define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
108 #define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
109 #define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
110 #define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
111 #define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)
112
113 #define UVR0(r...) UVR0_(r)
114 #define UVR1(r...) UVR1_(r)
115 #define UVR2(r...) UVR2_(r, 36)
116 #define UVR3(r...) UVR3_(r, 36, 35)
117 #define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
118 #define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
119 #define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
120 #define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
121
122 #define UVR(X) [w##X] "+&w" (w##X)
123
/*
 * Split a register list into pairs.
 *
 * R_01(...) yields the first two entries of its argument list.
 * R_23(...) yields the third and fourth entries; the list is padded with
 * 1, 2, 3 so that the selection is still well-formed when fewer than four
 * entries are supplied.
 */
#define R_01(first, second, ...) first, second
#define _R_23(skip0, skip1, third, fourth, ...) third, fourth
#define R_23(...) _R_23(__VA_ARGS__, 1, 2, 3)
127
/*
 * Used as the default: branch of the register-count switches in the
 * macros below, i.e. for a register count they do not handle.
 */
128 #define ZFS_ASM_BUG()   ASSERT(0)
129
/*
 * OFFSET(ptr, val): the address val bytes past ptr, as an unsigned char *.
 *
 * Both arguments are parenthesized so that an offset expression such as
 * OFFSET(p, i << 4) or OFFSET(p, a | b) is added as a whole instead of
 * being re-associated with the pointer by operator precedence.
 */
#define OFFSET(ptr, val)        (((unsigned char *)(ptr)) + (val))
131
/*
 * Lookup tables read by _MULx2(): for a constant c it loads the four
 * 16-byte rows 4*c+0 .. 4*c+3.  Defined outside this header.
 */
132 extern const uint8_t gf_clmul_mod_lt[4*256][16];
133
/* Size in bytes of one vector element; also its required alignment. */
#define ELEM_SIZE 16

/*
 * One ELEM_SIZE-byte vector element.  The byte array is aligned to
 * ELEM_SIZE, which makes the whole structure ELEM_SIZE-aligned.
 */
struct v {
        uint8_t b[ELEM_SIZE] __attribute__((__aligned__(ELEM_SIZE)));
};

typedef struct v v_t;
139
/*
 * XOR_ACC(src, r...): XOR memory into the registers of r, in place.
 *
 * Register k of r is XORed with the 16 bytes at src + 16*k.  Lists of 2,
 * 4 or 8 registers are handled; any other count reaches ZFS_ASM_BUG().
 *
 * The data is first loaded into v18-v21 (v20-v21 for two registers),
 * which are named explicitly and listed as clobbers.  For 8 registers the
 * same four temporaries are reused for the second group of four loads.
 */
140 #define XOR_ACC(src, r...)                                              \
141 {                                                                       \
142         switch (REG_CNT(r)) {                                           \
143         case 8:                                                         \
144                 __asm(                                                  \
145                 "ld1 { v21.4s },%[SRC0]\n"                              \
146                 "ld1 { v20.4s },%[SRC1]\n"                              \
147                 "ld1 { v19.4s },%[SRC2]\n"                              \
148                 "ld1 { v18.4s },%[SRC3]\n"                              \
149                 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"           \
150                 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"           \
151                 "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"           \
152                 "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"           \
153                 "ld1 { v21.4s },%[SRC4]\n"                              \
154                 "ld1 { v20.4s },%[SRC5]\n"                              \
155                 "ld1 { v19.4s },%[SRC6]\n"                              \
156                 "ld1 { v18.4s },%[SRC7]\n"                              \
157                 "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n"           \
158                 "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n"           \
159                 "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n"           \
160                 "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n"           \
161                 :       UVR0(r), UVR1(r), UVR2(r), UVR3(r),             \
162                         UVR4(r), UVR5(r), UVR6(r), UVR7(r)              \
163                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
164                 [SRC1] "Q" (*(OFFSET(src, 16))),                        \
165                 [SRC2] "Q" (*(OFFSET(src, 32))),                        \
166                 [SRC3] "Q" (*(OFFSET(src, 48))),                        \
167                 [SRC4] "Q" (*(OFFSET(src, 64))),                        \
168                 [SRC5] "Q" (*(OFFSET(src, 80))),                        \
169                 [SRC6] "Q" (*(OFFSET(src, 96))),                        \
170                 [SRC7] "Q" (*(OFFSET(src, 112)))                        \
171                 :       "v18", "v19", "v20", "v21");                    \
172                 break;                                                  \
173         case 4:                                                         \
174                 __asm(                                                  \
175                 "ld1 { v21.4s },%[SRC0]\n"                              \
176                 "ld1 { v20.4s },%[SRC1]\n"                              \
177                 "ld1 { v19.4s },%[SRC2]\n"                              \
178                 "ld1 { v18.4s },%[SRC3]\n"                              \
179                 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"           \
180                 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"           \
181                 "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"           \
182                 "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"           \
183                 :       UVR0(r), UVR1(r), UVR2(r), UVR3(r)              \
184                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
185                 [SRC1] "Q" (*(OFFSET(src, 16))),                        \
186                 [SRC2] "Q" (*(OFFSET(src, 32))),                        \
187                 [SRC3] "Q" (*(OFFSET(src, 48)))                         \
188                 :       "v18", "v19", "v20", "v21");                    \
189                 break;                                                  \
190         case 2:                                                         \
191                 __asm(                                                  \
192                 "ld1 { v21.4s },%[SRC0]\n"                              \
193                 "ld1 { v20.4s },%[SRC1]\n"                              \
194                 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"           \
195                 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"           \
196                 :       UVR0(r), UVR1(r)                                \
197                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
198                 [SRC1] "Q" (*(OFFSET(src, 16)))                         \
199                 :       "v20", "v21");                                  \
200                 break;                                                  \
201         default:                                                        \
202                 ZFS_ASM_BUG();                                          \
203         }                                                               \
204 }
205
/*
 * XOR(r...): XOR the first half of the register list into the second half.
 *
 * With 8 registers: r4..r7 ^= r0..r3.  With 4 registers: r2,r3 ^= r0,r1.
 * The first half is passed as input only; the second half is read-write.
 * Any other register count reaches ZFS_ASM_BUG().
 */
206 #define XOR(r...)                                                       \
207 {                                                                       \
208         switch (REG_CNT(r)) {                                           \
209         case 8:                                                         \
210                 __asm(                                                  \
211                 "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n"    \
212                 "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n"    \
213                 "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n"    \
214                 "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n"    \
215                 :       UVR4(r), UVR5(r), UVR6(r), UVR7(r)              \
216                 :       RVR0(r), RVR1(r), RVR2(r), RVR3(r));            \
217                 break;                                                  \
218         case 4:                                                         \
219                 __asm(                                                  \
220                 "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n"    \
221                 "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n"    \
222                 :       UVR2(r), UVR3(r)                                \
223                 :       RVR0(r), RVR1(r));                              \
224                 break;                                                  \
225         default:                                                        \
226                 ZFS_ASM_BUG();                                          \
227         }                                                               \
228 }
229
/*
 * ZERO(r...): clear every register in r by XORing it with itself.
 *
 * Lists of 2, 4 or 8 registers are handled; any other count reaches
 * ZFS_ASM_BUG().  The registers are declared as write-only outputs.
 */
230 #define ZERO(r...)                                                      \
231 {                                                                       \
232         switch (REG_CNT(r)) {                                           \
233         case 8:                                                         \
234                 __asm(                                                  \
235                 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"    \
236                 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"    \
237                 "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"    \
238                 "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"    \
239                 "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n"    \
240                 "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n"    \
241                 "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n"    \
242                 "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n"    \
243                 :       WVR0(r), WVR1(r), WVR2(r), WVR3(r),             \
244                         WVR4(r), WVR5(r), WVR6(r), WVR7(r));            \
245                 break;                                                  \
246         case 4:                                                         \
247                 __asm(                                                  \
248                 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"    \
249                 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"    \
250                 "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"    \
251                 "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"    \
252                 :       WVR0(r), WVR1(r), WVR2(r), WVR3(r));            \
253                 break;                                                  \
254         case 2:                                                         \
255                 __asm(                                                  \
256                 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"    \
257                 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"    \
258                 :       WVR0(r), WVR1(r));                              \
259                 break;                                                  \
260         default:                                                        \
261                 ZFS_ASM_BUG();                                          \
262         }                                                               \
263 }
264
/*
 * COPY(r...): copy the first half of the register list into the second
 * half.
 *
 * With 8 registers: r4..r7 = r0..r3.  With 4 registers: r2,r3 = r0,r1.
 * Any other register count reaches ZFS_ASM_BUG().
 */
265 #define COPY(r...)                                                      \
266 {                                                                       \
267         switch (REG_CNT(r)) {                                           \
268         case 8:                                                         \
269                 __asm(                                                  \
270                 "mov " VR4(r) ".16b," VR0(r) ".16b\n"                   \
271                 "mov " VR5(r) ".16b," VR1(r) ".16b\n"                   \
272                 "mov " VR6(r) ".16b," VR2(r) ".16b\n"                   \
273                 "mov " VR7(r) ".16b," VR3(r) ".16b\n"                   \
274                 :       WVR4(r), WVR5(r), WVR6(r), WVR7(r)              \
275                 :       RVR0(r), RVR1(r), RVR2(r), RVR3(r));            \
276                 break;                                                  \
277         case 4:                                                         \
278                 __asm(                                                  \
279                 "mov " VR2(r) ".16b," VR0(r) ".16b\n"                   \
280                 "mov " VR3(r) ".16b," VR1(r) ".16b\n"                   \
281                 :       WVR2(r), WVR3(r)                                \
282                 :       RVR0(r), RVR1(r));                              \
283                 break;                                                  \
284         default:                                                        \
285                 ZFS_ASM_BUG();                                          \
286         }                                                               \
287 }
288
/*
 * LOAD(src, r...): load the registers of r from memory.
 *
 * Register k of r receives the 16 bytes at src + 16*k.  Lists of 2, 4 or
 * 8 registers are handled; any other count reaches ZFS_ASM_BUG().
 */
289 #define LOAD(src, r...)                                                 \
290 {                                                                       \
291         switch (REG_CNT(r)) {                                           \
292         case 8:                                                         \
293                 __asm(                                                  \
294                 "ld1 { " VR0(r) ".4s },%[SRC0]\n"                       \
295                 "ld1 { " VR1(r) ".4s },%[SRC1]\n"                       \
296                 "ld1 { " VR2(r) ".4s },%[SRC2]\n"                       \
297                 "ld1 { " VR3(r) ".4s },%[SRC3]\n"                       \
298                 "ld1 { " VR4(r) ".4s },%[SRC4]\n"                       \
299                 "ld1 { " VR5(r) ".4s },%[SRC5]\n"                       \
300                 "ld1 { " VR6(r) ".4s },%[SRC6]\n"                       \
301                 "ld1 { " VR7(r) ".4s },%[SRC7]\n"                       \
302                 :       WVR0(r), WVR1(r), WVR2(r), WVR3(r),             \
303                         WVR4(r), WVR5(r), WVR6(r), WVR7(r)              \
304                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
305                 [SRC1] "Q" (*(OFFSET(src, 16))),                        \
306                 [SRC2] "Q" (*(OFFSET(src, 32))),                        \
307                 [SRC3] "Q" (*(OFFSET(src, 48))),                        \
308                 [SRC4] "Q" (*(OFFSET(src, 64))),                        \
309                 [SRC5] "Q" (*(OFFSET(src, 80))),                        \
310                 [SRC6] "Q" (*(OFFSET(src, 96))),                        \
311                 [SRC7] "Q" (*(OFFSET(src, 112))));                      \
312                 break;                                                  \
313         case 4:                                                         \
314                 __asm(                                                  \
315                 "ld1 { " VR0(r) ".4s },%[SRC0]\n"                       \
316                 "ld1 { " VR1(r) ".4s },%[SRC1]\n"                       \
317                 "ld1 { " VR2(r) ".4s },%[SRC2]\n"                       \
318                 "ld1 { " VR3(r) ".4s },%[SRC3]\n"                       \
319                 :       WVR0(r), WVR1(r), WVR2(r), WVR3(r)              \
320                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
321                 [SRC1] "Q" (*(OFFSET(src, 16))),                        \
322                 [SRC2] "Q" (*(OFFSET(src, 32))),                        \
323                 [SRC3] "Q" (*(OFFSET(src, 48))));                       \
324                 break;                                                  \
325         case 2:                                                         \
326                 __asm(                                                  \
327                 "ld1 { " VR0(r) ".4s },%[SRC0]\n"                       \
328                 "ld1 { " VR1(r) ".4s },%[SRC1]\n"                       \
329                 :       WVR0(r), WVR1(r)                                \
330                 :       [SRC0] "Q" (*(OFFSET(src, 0))),                 \
331                 [SRC1] "Q" (*(OFFSET(src, 16))));                       \
332                 break;                                                  \
333         default:                                                        \
334                 ZFS_ASM_BUG();                                          \
335         }                                                               \
336 }
337
/*
 * STORE(dst, r...): store the registers of r to memory.
 *
 * Register k of r is written to the 16 bytes at dst + 16*k.  Lists of 2,
 * 4 or 8 registers are handled; any other count reaches ZFS_ASM_BUG().
 */
338 #define STORE(dst, r...)                                                \
339 {                                                                       \
340         switch (REG_CNT(r)) {                                           \
341         case 8:                                                         \
342                 __asm(                                                  \
343                 "st1 { " VR0(r) ".4s },%[DST0]\n"                       \
344                 "st1 { " VR1(r) ".4s },%[DST1]\n"                       \
345                 "st1 { " VR2(r) ".4s },%[DST2]\n"                       \
346                 "st1 { " VR3(r) ".4s },%[DST3]\n"                       \
347                 "st1 { " VR4(r) ".4s },%[DST4]\n"                       \
348                 "st1 { " VR5(r) ".4s },%[DST5]\n"                       \
349                 "st1 { " VR6(r) ".4s },%[DST6]\n"                       \
350                 "st1 { " VR7(r) ".4s },%[DST7]\n"                       \
351                 :       [DST0] "=Q" (*(OFFSET(dst, 0))),                \
352                 [DST1] "=Q" (*(OFFSET(dst, 16))),                       \
353                 [DST2] "=Q" (*(OFFSET(dst, 32))),                       \
354                 [DST3] "=Q" (*(OFFSET(dst, 48))),                       \
355                 [DST4] "=Q" (*(OFFSET(dst, 64))),                       \
356                 [DST5] "=Q" (*(OFFSET(dst, 80))),                       \
357                 [DST6] "=Q" (*(OFFSET(dst, 96))),                       \
358                 [DST7] "=Q" (*(OFFSET(dst, 112)))                       \
359                 :       RVR0(r), RVR1(r), RVR2(r), RVR3(r),             \
360                         RVR4(r), RVR5(r), RVR6(r), RVR7(r));            \
361                 break;                                                  \
362         case 4:                                                         \
363                 __asm(                                                  \
364                 "st1 { " VR0(r) ".4s },%[DST0]\n"                       \
365                 "st1 { " VR1(r) ".4s },%[DST1]\n"                       \
366                 "st1 { " VR2(r) ".4s },%[DST2]\n"                       \
367                 "st1 { " VR3(r) ".4s },%[DST3]\n"                       \
368                 :       [DST0] "=Q" (*(OFFSET(dst, 0))),                \
369                 [DST1] "=Q" (*(OFFSET(dst, 16))),                       \
370                 [DST2] "=Q" (*(OFFSET(dst, 32))),                       \
371                 [DST3] "=Q" (*(OFFSET(dst, 48)))                        \
372                 :       RVR0(r), RVR1(r), RVR2(r), RVR3(r));            \
373                 break;                                                  \
374         case 2:                                                         \
375                 __asm(                                                  \
376                 "st1 { " VR0(r) ".4s },%[DST0]\n"                       \
377                 "st1 { " VR1(r) ".4s },%[DST1]\n"                       \
378                 :       [DST0] "=Q" (*(OFFSET(dst, 0))),                \
379                 [DST1] "=Q" (*(OFFSET(dst, 16)))                        \
380                 :       RVR0(r), RVR1(r));                              \
381                 break;                                                  \
382         default:                                                        \
383                 ZFS_ASM_BUG();                                          \
384         }                                                               \
385 }
386
387 /*
388  * Unfortunately cannot use the macro, because GCC
389  * will try to use the macro name and not value
390  * later on...
391  * Kept as a reference to what a numbered variable is
392  */
/*
 * Roles in MUL2_SETUP()/MUL2(): MUL2_SETUP() zeroes operand w17 and fills
 * every byte of operand w16 with 0x1d; MUL2() uses v18-v21 as scratch
 * (v18-v19 only for two registers).
 * NOTE(review): presumably w17/w16 are bound to v17/v16 where the w<N>
 * variables are declared -- that is not visible here, confirm.
 */
393 #define _00     "v17"
394 #define _1d     "v16"
395 #define _temp0  "v19"
396 #define _temp1  "v18"
397
/*
 * MUL2_SETUP(): prepare the two constants that MUL2() reads.
 *
 * Operand w17 is cleared (XORed with itself) and every byte of operand
 * w16 is set to 0x1d.
 */
398 #define MUL2_SETUP()                                                    \
399 {                                                                       \
400         __asm(                                                          \
401         "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n"            \
402         "movi " VR(16) ".16b,#0x1d\n"                                   \
403         :       WVR(16), WVR(17));                                      \
404 }
405
/*
 * MUL2(r...): double every byte of the registers in r, in place.
 *
 * Per register x, with w17 and w16 as left by MUL2_SETUP() (zero and
 * 0x1d in every byte):
 *   mask = (w17 > x), a signed byte compare: all-ones where x has its
 *          top bit set
 *   mask &= w16
 *   x <<= 1
 *   x ^= mask
 * Presumably this is multiplication by 2 in GF(2^8) with 0x1d as the
 * reduction constant.
 *
 * Lists of 2 or 4 registers are handled; any other count reaches
 * ZFS_ASM_BUG().  v18-v21 (v18-v19 for two registers) hold the masks and
 * are listed as clobbers.
 */
406 #define MUL2(r...)                                                      \
407 {                                                                       \
408         switch (REG_CNT(r)) {                                           \
409         case 4:                                                         \
410                 __asm(                                                  \
411                 "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"          \
412                 "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"          \
413                 "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n"          \
414                 "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n"          \
415                 "and v19.16b,v19.16b," VR(16) ".16b\n"                  \
416                 "and v18.16b,v18.16b," VR(16) ".16b\n"                  \
417                 "and v21.16b,v21.16b," VR(16) ".16b\n"                  \
418                 "and v20.16b,v20.16b," VR(16) ".16b\n"                  \
419                 "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"                \
420                 "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"                \
421                 "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n"                \
422                 "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n"                \
423                 "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"           \
424                 "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"           \
425                 "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n"           \
426                 "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n"           \
427                 :       UVR0(r), UVR1(r), UVR2(r), UVR3(r)              \
428                 :       RVR(17), RVR(16)                                \
429                 :       "v18", "v19", "v20", "v21");                    \
430                 break;                                                  \
431         case 2:                                                         \
432                 __asm(                                                  \
433                 "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"          \
434                 "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"          \
435                 "and v19.16b,v19.16b," VR(16) ".16b\n"                  \
436                 "and v18.16b,v18.16b," VR(16) ".16b\n"                  \
437                 "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"                \
438                 "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"                \
439                 "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"           \
440                 "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"           \
441                 :       UVR0(r), UVR1(r)                                \
442                 :       RVR(17), RVR(16)                                \
443                 :       "v18", "v19");                                  \
444                 break;                                                  \
445         default:                                                        \
446                 ZFS_ASM_BUG();                                          \
447         }                                                               \
448 }
449
/*
 * MUL4(r...): apply MUL2() to the registers of r twice in a row.
 */
450 #define MUL4(r...)                                                      \
451 {                                                                       \
452         MUL2(r);                                                        \
453         MUL2(r);                                                        \
454 }
455
456 /*
457  * Unfortunately cannot use the macro, because GCC
458  * will try to use the macro name and not value
459  * later on...
460  * Kept as a reference to what a register is
461  * (here we're using actual registers for the
462  * clobbered ones)
463  */
/*
 * Register roles inside _MULx2().  v15 appears twice on purpose: it first
 * holds the 0x0f mask and is later overwritten with table data and lookup
 * results.
 */
464 #define _0f             "v15"
465 #define _a_save         "v14"
466 #define _b_save         "v13"
467 #define _lt_mod_a       "v12"
468 #define _lt_clmul_a     "v11"
469 #define _lt_mod_b       "v10"
470 #define _lt_clmul_b     "v15"
471
/*
 * _MULx2(c, r...): transform the two registers of r, in place, through the
 * gf_clmul_mod_lt lookup tables selected by c (presumably a GF(2^8)
 * multiplication of every byte by the constant c).
 *
 * Each byte is split into its low nibble (saved in v14/v13) and its high
 * nibble (left in the register after the shift by 4).  The high nibble
 * indexes table rows 4*c+0 and 4*c+1, the low nibble rows 4*c+2 and
 * 4*c+3, and the four tbl lookups are XORed together to form the result.
 *
 * Only a two-register list is accepted; any other count reaches
 * ZFS_ASM_BUG().  v10-v15 are used as scratch and listed as clobbers.
 */
472 #define _MULx2(c, r...)                                                 \
473 {                                                                       \
474         switch (REG_CNT(r)) {                                           \
475         case 2:                                                         \
476                 __asm(                                                  \
477                 /* lts for upper part */                                \
478                 "movi v15.16b,#0x0f\n"                                  \
479                 "ld1 { v10.4s },%[lt0]\n"                               \
480                 "ld1 { v11.4s },%[lt1]\n"                               \
481                 /* upper part */                                        \
482                 "and v14.16b," VR0(r) ".16b,v15.16b\n"                  \
483                 "and v13.16b," VR1(r) ".16b,v15.16b\n"                  \
484                 "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n"               \
485                 "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n"               \
486                                                                         \
487                 "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n"                \
488                 "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n"                \
489                 "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n"                \
490                 "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n"                \
491                                                                         \
492                 "eor " VR0(r) ".16b,v15.16b,v12.16b\n"                  \
493                 "eor " VR1(r) ".16b,v11.16b,v10.16b\n"                  \
494                 /* lts for lower part */                                \
495                 "ld1 { v10.4s },%[lt2]\n"                               \
496                 "ld1 { v15.4s },%[lt3]\n"                               \
497                 /* lower part */                                        \
498                 "tbl v12.16b,{v10.16b},v14.16b\n"                       \
499                 "tbl v10.16b,{v10.16b},v13.16b\n"                       \
500                 "tbl v11.16b,{v15.16b},v14.16b\n"                       \
501                 "tbl v15.16b,{v15.16b},v13.16b\n"                       \
502                                                                         \
503                 "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n"           \
504                 "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n"           \
505                 "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n"           \
506                 "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n"           \
507                 :       UVR0(r), UVR1(r)                                \
508                 :       [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])),      \
509                 [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])),              \
510                 [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])),              \
511                 [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0]))               \
512                 :       "v10", "v11", "v12", "v13", "v14", "v15");      \
513                 break;                                                  \
514         default:                                                        \
515                 ZFS_ASM_BUG();                                          \
516         }                                                               \
517 }
518
/*
 * MUL(c, r...) - run _MULx2() with constant selector c over the register
 * list r, one pair of registers per _MULx2() invocation.
 *
 *   c     forwarded unchanged to every _MULx2() invocation
 *   r...  register list; REG_CNT(r) must evaluate to 2 or 4
 *
 * A two-entry list is handled by a single _MULx2() on R_01(r).  A
 * four-entry list is handled by _MULx2() on R_23(r) first and on R_01(r)
 * second (NOTE(review): R_01/R_23 presumably select entries 0-1 and 2-3 of
 * the list; their definitions are outside this excerpt).  Any other count
 * ends up in ZFS_ASM_BUG().
 *
 * The expansion is a brace-enclosed compound statement.
 */
#define MUL(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 2:								\
		_MULx2(c, R_01(r));					\
		break;							\
	case 4:								\
		_MULx2(c, R_23(r));					\
		_MULx2(c, R_01(r));					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
533
/*
 * Entry/exit brackets for the RAID-Z math routines built on this header.
 * Both are plain aliases: raidz_math_begin() expands to kfpu_begin() and
 * raidz_math_end() expands to kfpu_end().
 * NOTE(review): kfpu_begin()/kfpu_end() are presumably supplied by
 * <sys/simd.h>, which this file includes; what they do is not visible
 * here -- confirm before relying on any particular semantics.
 */
#define raidz_math_begin()      kfpu_begin()
#define raidz_math_end()        kfpu_end()
536
/*
 * Declarations of the working variables w0..w38, each a 16-byte vector of
 * unsigned char (vector_size(16)).
 *
 * GEN_X_VREG(var, reg) emits exactly one declaration:
 *   - with _KERNEL defined, a "register" variable bound to the register
 *     named by the string reg through asm(reg);
 *   - otherwise, an ordinary variable; reg is accepted but ignored.
 * Each expansion ends in its own semicolon, so the groups below can be
 * chained back to back, as GEN_X_DEFINE_ALL() does.
 *
 * Binding every single variable by hand is overkill...
 */
#if defined(_KERNEL)
#define GEN_X_VREG(var, reg)						\
	register unsigned char var asm(reg) __attribute__((vector_size(16)));
#else
#define GEN_X_VREG(var, reg)						\
	unsigned char var __attribute__((vector_size(16)));
#endif

#define GEN_X_DEFINE_0_3()		\
	GEN_X_VREG(w0, "v0")		\
	GEN_X_VREG(w1, "v1")		\
	GEN_X_VREG(w2, "v2")		\
	GEN_X_VREG(w3, "v3")
#define GEN_X_DEFINE_4_5()		\
	GEN_X_VREG(w4, "v4")		\
	GEN_X_VREG(w5, "v5")
#define GEN_X_DEFINE_6_7()		\
	GEN_X_VREG(w6, "v6")		\
	GEN_X_VREG(w7, "v7")
#define GEN_X_DEFINE_8_9()		\
	GEN_X_VREG(w8, "v8")		\
	GEN_X_VREG(w9, "v9")
#define GEN_X_DEFINE_10_11()		\
	GEN_X_VREG(w10, "v10")		\
	GEN_X_VREG(w11, "v11")
#define GEN_X_DEFINE_12_15()		\
	GEN_X_VREG(w12, "v12")		\
	GEN_X_VREG(w13, "v13")		\
	GEN_X_VREG(w14, "v14")		\
	GEN_X_VREG(w15, "v15")
#define GEN_X_DEFINE_16()		\
	GEN_X_VREG(w16, "v16")
#define GEN_X_DEFINE_17()		\
	GEN_X_VREG(w17, "v17")
#define GEN_X_DEFINE_18_21()		\
	GEN_X_VREG(w18, "v18")		\
	GEN_X_VREG(w19, "v19")		\
	GEN_X_VREG(w20, "v20")		\
	GEN_X_VREG(w21, "v21")
#define GEN_X_DEFINE_22_23()		\
	GEN_X_VREG(w22, "v22")		\
	GEN_X_VREG(w23, "v23")
#define GEN_X_DEFINE_24_27()		\
	GEN_X_VREG(w24, "v24")		\
	GEN_X_VREG(w25, "v25")		\
	GEN_X_VREG(w26, "v26")		\
	GEN_X_VREG(w27, "v27")
#define GEN_X_DEFINE_28_30()		\
	GEN_X_VREG(w28, "v28")		\
	GEN_X_VREG(w29, "v29")		\
	GEN_X_VREG(w30, "v30")
#define GEN_X_DEFINE_31()		\
	GEN_X_VREG(w31, "v31")
/*
 * w32..w38 are all bound to "v31" in kernel builds, exactly like w31.
 * NOTE(review): presumably these are placeholder names beyond the last
 * available vector register -- confirm against the users of w32..w38.
 */
#define GEN_X_DEFINE_32()		\
	GEN_X_VREG(w32, "v31")
#define GEN_X_DEFINE_33_36()		\
	GEN_X_VREG(w33, "v31")		\
	GEN_X_VREG(w34, "v31")		\
	GEN_X_VREG(w35, "v31")		\
	GEN_X_VREG(w36, "v31")
#define GEN_X_DEFINE_37_38()		\
	GEN_X_VREG(w37, "v31")		\
	GEN_X_VREG(w38, "v31")

/* Declares the complete set w0..w38 in one go. */
#define GEN_X_DEFINE_ALL()		\
	GEN_X_DEFINE_0_3()		\
	GEN_X_DEFINE_4_5()		\
	GEN_X_DEFINE_6_7()		\
	GEN_X_DEFINE_8_9()		\
	GEN_X_DEFINE_10_11()		\
	GEN_X_DEFINE_12_15()		\
	GEN_X_DEFINE_16()		\
	GEN_X_DEFINE_17()		\
	GEN_X_DEFINE_18_21()		\
	GEN_X_DEFINE_22_23()		\
	GEN_X_DEFINE_24_27()		\
	GEN_X_DEFINE_28_30()		\
	GEN_X_DEFINE_31()		\
	GEN_X_DEFINE_32()		\
	GEN_X_DEFINE_33_36()		\
	GEN_X_DEFINE_37_38()