1 //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines all of the R600- and AMDGCN-specific intrinsics.
12 //===----------------------------------------------------------------------===//
// Base class for intrinsics that read a preloaded hardware register: no
// arguments, returns i32, and is IntrNoMem + IntrSpeculatable since the value
// is fixed for the lifetime of the wave.
14 class AMDGPUReadPreloadRegisterIntrinsic
15 : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
// Same as above, but additionally attaches a GCC/Clang builtin name so the
// frontend can map __builtin_* calls directly to the intrinsic.
17 class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
18 : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin<name>;
20 // Used to tag image and resource intrinsics with information used to generate
// RsrcArg is the argument index of the resource descriptor; IsImage marks
// image (MIMG) intrinsics as opposed to buffer intrinsics.
// NOTE(review): the closing "}" of this class body (original line 25) is
// elided from this listing.
22 class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = 0> {
23 int RsrcArg = rsrcarg;
24 bit IsImage = isimage;
27 let TargetPrefix = "r600" in {
// Expands into three defs (_x, _y, _z), one per component of the preloaded
// register tuple (e.g. workgroup id, workitem id).
// NOTE(review): closing braces of these multiclasses (original lines 33/39)
// are elided from this listing.
29 multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
30 def _x : AMDGPUReadPreloadRegisterIntrinsic;
31 def _y : AMDGPUReadPreloadRegisterIntrinsic;
32 def _z : AMDGPUReadPreloadRegisterIntrinsic;
// Variant that also registers per-component builtin names: <prefix>_x/_y/_z.
35 multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
36 def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
37 def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
38 def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
// Preload-register readers exposed as builtins.
41 defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
42 <"__builtin_r600_read_global_size">;
43 defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
44 <"__builtin_r600_read_ngroups">;
45 defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
46 <"__builtin_r600_read_tgid">;
// Preload-register readers without builtin names.
48 defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
49 defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
// Work-group barrier; convergent because it must not be moved across
// control-flow points that differ between threads.
51 def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
52 Intrinsic<[], [], [IntrConvergent]>;
54 // AS 7 is PARAM_I_ADDRESS, used for kernel arguments
55 def int_r600_implicitarg_ptr :
56 GCCBuiltin<"__builtin_r600_implicitarg_ptr">,
57 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
58 [IntrNoMem, IntrSpeculatable]>;
60 def int_r600_rat_store_typed :
61 // 1st parameter: Data
62 // 2nd parameter: Index
63 // 3rd parameter: Constant RAT ID
64 Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
65 GCCBuiltin<"__builtin_r600_rat_store_typed">;
// NOTE(review): the ">;"-terminator lines of the next few defs (original
// lines 69/73/77/81) are elided from this listing.
67 def int_r600_recipsqrt_ieee : Intrinsic<
68 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
71 def int_r600_recipsqrt_clamped : Intrinsic<
72 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
75 def int_r600_cube : Intrinsic<
76 [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
79 def int_r600_store_stream_output : Intrinsic<
80 [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
// Texture sampling profile taking float coordinates; returns a v4f32 texel.
// NOTE(review): trailing property lines of both texture classes (original
// lines 94-95 and 108-109) are elided from this listing.
83 class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
84 llvm_v4f32_ty, // Coord
85 llvm_i32_ty, // offset_x
86 llvm_i32_ty, // offset_y,
87 llvm_i32_ty, // offset_z,
88 llvm_i32_ty, // resource_id
89 llvm_i32_ty, // samplerid
90 llvm_i32_ty, // coord_type_x
91 llvm_i32_ty, // coord_type_y
92 llvm_i32_ty, // coord_type_z
93 llvm_i32_ty], // coord_type_w
// Same profile with integer coordinates (used by txf/txq below).
97 class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
98 llvm_v4i32_ty, // Coord
99 llvm_i32_ty, // offset_x
100 llvm_i32_ty, // offset_y,
101 llvm_i32_ty, // offset_z,
102 llvm_i32_ty, // resource_id
103 llvm_i32_ty, // samplerid
104 llvm_i32_ty, // coord_type_x
105 llvm_i32_ty, // coord_type_y
106 llvm_i32_ty, // coord_type_z
107 llvm_i32_ty], // coord_type_w
111 def int_r600_store_swizzle :
112 Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
// One def per texture instruction flavor; the "c" suffix variants presumably
// denote depth-compare forms — TODO confirm against the R600 ISA docs.
115 def int_r600_tex : TextureIntrinsicFloatInput;
116 def int_r600_texc : TextureIntrinsicFloatInput;
117 def int_r600_txl : TextureIntrinsicFloatInput;
118 def int_r600_txlc : TextureIntrinsicFloatInput;
119 def int_r600_txb : TextureIntrinsicFloatInput;
120 def int_r600_txbc : TextureIntrinsicFloatInput;
121 def int_r600_txf : TextureIntrinsicInt32Input;
122 def int_r600_txq : TextureIntrinsicInt32Input;
123 def int_r600_ddx : TextureIntrinsicFloatInput;
124 def int_r600_ddy : TextureIntrinsicFloatInput;
126 def int_r600_dot4 : Intrinsic<[llvm_float_ty],
127 [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
130 def int_r600_kill : Intrinsic<[], [llvm_float_ty], []>;
132 } // End TargetPrefix = "r600"
134 let TargetPrefix = "amdgcn" in {
136 //===----------------------------------------------------------------------===//
137 // ABI Special Intrinsics
138 //===----------------------------------------------------------------------===//
140 defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
141 defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
142 <"__builtin_amdgcn_workgroup_id">;
// The following return pointers in address space 4 (constant) to various
// implicitly-passed kernel structures; all are IntrNoMem + IntrSpeculatable
// because the pointer values are fixed per dispatch.
144 def int_amdgcn_dispatch_ptr :
145 GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
146 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
147 [IntrNoMem, IntrSpeculatable]>;
149 def int_amdgcn_queue_ptr :
150 GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
151 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
152 [IntrNoMem, IntrSpeculatable]>;
154 def int_amdgcn_kernarg_segment_ptr :
155 GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
156 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
157 [IntrNoMem, IntrSpeculatable]>;
159 def int_amdgcn_implicitarg_ptr :
160 GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
161 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
162 [IntrNoMem, IntrSpeculatable]>;
164 def int_amdgcn_groupstaticsize :
165 GCCBuiltin<"__builtin_amdgcn_groupstaticsize">,
166 Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
168 def int_amdgcn_dispatch_id :
169 GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
170 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;
172 def int_amdgcn_implicit_buffer_ptr :
173 GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
174 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
175 [IntrNoMem, IntrSpeculatable]>;
177 // Set EXEC to the 64-bit value given.
178 // This is always moved to the beginning of the basic block.
// NOTE(review): the property-list/terminator lines of both init_exec defs
// (original lines 181 and 189-190) are elided from this listing.
179 def int_amdgcn_init_exec : Intrinsic<[],
180 [llvm_i64_ty], // 64-bit literal constant
183 // Set EXEC according to a thread count packed in an SGPR input:
184 // thread_count = (input >> bitoffset) & 0x7f;
185 // This is always moved to the beginning of the basic block.
186 def int_amdgcn_init_exec_from_input : Intrinsic<[],
187 [llvm_i32_ty, // 32-bit SGPR input
188 llvm_i32_ty], // bit offset of the thread count
192 //===----------------------------------------------------------------------===//
193 // Instruction Intrinsics
194 //===----------------------------------------------------------------------===//
196 // The first parameter is s_sendmsg immediate (i16),
197 // the second one is copied to m0
198 def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
199 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
200 def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
201 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
// Barriers are IntrConvergent so they are not moved across divergent control
// flow.
203 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
204 Intrinsic<[], [], [IntrConvergent]>;
206 def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
207 Intrinsic<[], [], [IntrConvergent]>;
209 def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
210 Intrinsic<[], [llvm_i32_ty], []>;
// NOTE(review): many defs in this section are missing their ">;"-terminator
// line in this listing (gaps in the original numbering, e.g. 220-221).
212 def int_amdgcn_div_scale : Intrinsic<
213 // 1st parameter: Numerator
214 // 2nd parameter: Denominator
215 // 3rd parameter: Constant to select select between first and
216 // second. (0 = first, 1 = second).
217 [llvm_anyfloat_ty, llvm_i1_ty],
218 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
219 [IntrNoMem, IntrSpeculatable]
222 def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
223 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
224 [IntrNoMem, IntrSpeculatable]
227 def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
228 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
229 [IntrNoMem, IntrSpeculatable]
232 def int_amdgcn_trig_preop : Intrinsic<
233 [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
234 [IntrNoMem, IntrSpeculatable]
237 def int_amdgcn_sin : Intrinsic<
238 [llvm_anyfloat_ty], [LLVMMatchType<0>],
239 [IntrNoMem, IntrSpeculatable]
242 def int_amdgcn_cos : Intrinsic<
243 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
246 def int_amdgcn_log_clamp : Intrinsic<
247 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
250 def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">,
251 Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
252 [IntrNoMem, IntrSpeculatable]
255 def int_amdgcn_rcp : Intrinsic<
256 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
259 def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">,
260 Intrinsic<[llvm_float_ty], [llvm_float_ty],
261 [IntrNoMem, IntrSpeculatable]
264 def int_amdgcn_rsq : Intrinsic<
265 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
268 def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">,
270 [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
273 def int_amdgcn_rsq_clamp : Intrinsic<
274 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;
276 def int_amdgcn_ldexp : Intrinsic<
277 [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
278 [IntrNoMem, IntrSpeculatable]
281 def int_amdgcn_frexp_mant : Intrinsic<
282 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
285 def int_amdgcn_frexp_exp : Intrinsic<
286 [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
289 // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
290 // and always uses rtz, so is not suitable for implementing the OpenCL
291 // fract function. It should be ok on VI.
292 def int_amdgcn_fract : Intrinsic<
293 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
// Pack two scalars into a packed 16-bit pair (rtz float, signed/unsigned
// normalized, or plain integer variants).
296 def int_amdgcn_cvt_pkrtz : Intrinsic<
297 [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
298 [IntrNoMem, IntrSpeculatable]
301 def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
302 [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
303 [IntrNoMem, IntrSpeculatable]
306 def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
307 [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
308 [IntrNoMem, IntrSpeculatable]
311 def int_amdgcn_cvt_pk_i16 : Intrinsic<
312 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
313 [IntrNoMem, IntrSpeculatable]
316 def int_amdgcn_cvt_pk_u16 : Intrinsic<
317 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
318 [IntrNoMem, IntrSpeculatable]
321 def int_amdgcn_class : Intrinsic<
322 [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
323 [IntrNoMem, IntrSpeculatable]
326 def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">,
327 Intrinsic<[llvm_anyfloat_ty],
328 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
329 [IntrNoMem, IntrSpeculatable]
// Cubemap helper opcodes: each takes three f32 coordinates.
332 def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">,
333 Intrinsic<[llvm_float_ty],
334 [llvm_float_ty, llvm_float_ty, llvm_float_ty],
335 [IntrNoMem, IntrSpeculatable]
338 def int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">,
339 Intrinsic<[llvm_float_ty],
340 [llvm_float_ty, llvm_float_ty, llvm_float_ty],
341 [IntrNoMem, IntrSpeculatable]
344 def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">,
345 Intrinsic<[llvm_float_ty],
346 [llvm_float_ty, llvm_float_ty, llvm_float_ty],
347 [IntrNoMem, IntrSpeculatable]
350 def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">,
351 Intrinsic<[llvm_float_ty],
352 [llvm_float_ty, llvm_float_ty, llvm_float_ty],
353 [IntrNoMem, IntrSpeculatable]
356 // v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
358 def int_amdgcn_sffbh :
359 Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
360 [IntrNoMem, IntrSpeculatable]
363 // v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
364 def int_amdgcn_fmad_ftz :
365 Intrinsic<[llvm_anyfloat_ty],
366 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
367 [IntrNoMem, IntrSpeculatable]
370 // Fields should mirror atomicrmw
// NOTE(review): the pointer/value operand lines of this class (original
// lines 372-373) and its terminator are elided from this listing.
371 class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
374 llvm_i32_ty, // ordering
375 llvm_i32_ty, // scope
376 llvm_i1_ty], // isVolatile
377 [IntrArgMemOnly, NoCapture<0>], "",
381 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
382 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
// LDS (address space 3) float atomics; value operand line (original 388)
// is elided from this listing.
384 class AMDGPULDSF32Intrin<string clang_builtin> :
385 GCCBuiltin<clang_builtin>,
386 Intrinsic<[llvm_float_ty],
387 [LLVMQualPointerType<llvm_float_ty, 3>,
389 llvm_i32_ty, // ordering
390 llvm_i32_ty, // scope
391 llvm_i1_ty], // isVolatile
392 [IntrArgMemOnly, NoCapture<0>]
395 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
396 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
397 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
399 } // TargetPrefix = "amdgcn"
401 // New-style image intrinsics
403 //////////////////////////////////////////////////////////////////////////
404 // Dimension-aware image intrinsics framework
405 //////////////////////////////////////////////////////////////////////////
407 // Helper class to represent (type, name) combinations of arguments. The
408 // argument names are explanatory and used as DAG operand names for codegen
// NOTE(review): the fields of AMDGPUArg (original lines 411-412, presumably
// a Type and a Name) and the closing brace are elided from this listing.
410 class AMDGPUArg<LLVMType ty, string name> {
415 // Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
// The first element gets the concrete overloaded base type; every later
// element becomes LLVMMatchType<0> so all components share one overload.
416 class makeArgList<list<string> names, LLVMType basety> {
417 list<AMDGPUArg> ret =
418 !listconcat([AMDGPUArg<basety, names[0]>],
419 !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
422 // Return arglist, with LLVMMatchType's references shifted by 'shift'.
// NOTE(review): the !if else-branch and terminator of this !foreach
// (original lines 428-430) are elided from this listing.
423 class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
424 list<AMDGPUArg> ret =
425 !foreach(arg, arglist,
426 !if(!isa<LLVMMatchType>(arg.Type),
427 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
432 // Return the concatenation of the given arglists. LLVMMatchType's are adjusted
433 // accordingly, and shifted by an additional 'shift'.
// Folds over the arglists, shifting each list's match references by the
// number of 'any' types accumulated so far (b.Type.isAny) plus 'shift'.
434 class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
435 list<AMDGPUArg> ret =
436 !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
439 arglistmatchshift<rhs,
440 !add(shift, !foldl(0, lhs, a, b,
441 !add(a, b.Type.isAny)))>.ret));
444 // Represent texture/image types / dimensionality.
445 class AMDGPUDimProps<string name, list<string> coord_names, list<string> slice_names> {
446 AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
447 string Name = name; // e.g. "2darraymsaa"
448 bit DA = 0; // DA bit in MIMG encoding
// Coordinate (+ slice) argument lists, in float and integer flavors; which
// one is used depends on whether the intrinsic is a sample operation.
450 list<AMDGPUArg> CoordSliceArgs =
451 makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
452 list<AMDGPUArg> CoordSliceIntArgs =
453 makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
// Derivative args: d<coord>dh then d<coord>dv for each coordinate.
454 list<AMDGPUArg> GradientArgs =
455 makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
456 !foreach(name, coord_names, "d" # name # "dv")),
457 llvm_anyfloat_ty>.ret;
459 bits<8> NumCoords = !size(CoordSliceArgs);
460 bits<8> NumGradients = !size(GradientArgs);
463 def AMDGPUDim1D : AMDGPUDimProps<"1d", ["s"], []>;
464 def AMDGPUDim2D : AMDGPUDimProps<"2d", ["s", "t"], []>;
465 def AMDGPUDim3D : AMDGPUDimProps<"3d", ["s", "t", "r"], []>;
466 def AMDGPUDimCube : AMDGPUDimProps<"cube", ["s", "t"], ["face"]>;
// (listing note: the original numbering shows a line elided before each of
// the following def groups, presumably "let DA = 1 in" style wrappers --
// TODO confirm against the unabridged file)
467 def AMDGPUDimCube : AMDGPUDimProps<"cube", ["s", "t"], ["face"]>;
468 def AMDGPUDim1DArray : AMDGPUDimProps<"1darray", ["s"], ["slice"]>;
469 def AMDGPUDim2DArray : AMDGPUDimProps<"2darray", ["s", "t"], ["slice"]>;
471 def AMDGPUDim2DMsaa : AMDGPUDimProps<"2dmsaa", ["s", "t"], ["fragid"]>;
473 def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<"2darraymsaa", ["s", "t"], ["slice", "fragid"]>;
// NOTE(review): the opening "def AMDGPUDims {" line (original ~476) is
// elided from this listing; the following fields belong to that record.
477 list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
478 AMDGPUDimCube, AMDGPUDim1DArray,
480 list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
481 list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
484 // Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
485 class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
486 string UpperCaseMod = ucmod;
487 string LowerCaseMod = lcmod;
489 // {offset} {bias} {z-compare}
490 list<AMDGPUArg> ExtraAddrArgs = extra_addr;
493 // Name of the {lod} or {clamp} argument that is appended to the coordinates,
495 string LodOrClamp = "";
// NOTE(review): fields at original lines 488/491-492/494/496 (likely a
// Gradients bit among them, given the "let Gradients = 1" below) are elided
// from this listing.
498 // AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
499 // AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
500 defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
// Each helper doubles the variant set: base and _o (offset) forms ...
501 multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
502 list<AMDGPUArg> extra_addr> {
503 def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
504 def NAME#lcmod#_o : AMDGPUSampleVariant<
505 ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
// ... then base and _c (depth-compare) forms ...
508 multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
509 list<AMDGPUArg> extra_addr> {
510 defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
511 defm NAME : AMDGPUSampleHelper_Offset<
512 "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
// ... then base and _cl (clamp) forms.
515 multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
516 list<AMDGPUArg> extra_addr> {
517 defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
518 let LodOrClamp = "clamp" in
519 defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
// Variants without gradients (usable by gather4); the _d/_cd derivative
// variants below are added only to the full AMDGPUSampleVariants set.
522 defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
523 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
524 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
525 "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
526 let LodOrClamp = "lod" in
527 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
528 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
531 let Gradients = 1 in {
532 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
533 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
537 // Helper class to capture the profile of a dimension-aware image intrinsic.
538 // This information is used to generate the intrinsic's type and to inform
539 // codegen pattern matching.
540 class AMDGPUDimProfile<string opmod,
541 AMDGPUDimProps dim> {
542 AMDGPUDimProps Dim = dim;
543 string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod
545 // These are intended to be overwritten by subclasses
// NOTE(review): several field lines (original 544/546-547/551/553, which
// should include the IsSample/IsAtomic/Gradients bits referenced below) are
// elided from this listing.
548 list<LLVMType> RetTypes = [];
549 list<AMDGPUArg> DataArgs = [];
550 list<AMDGPUArg> ExtraAddrArgs = [];
552 string LodClampMip = "";
// Count of overloaded ('any') types among return and data args; used to
// shift LLVMMatchType references in the address args.
554 int NumRetAndDataAnyTypes =
555 !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
// Full address operand list: extra args, then gradients (if any), then
// coordinates/slice, then the optional lod/clamp/mip tail argument.
558 list<AMDGPUArg> AddrArgs =
559 arglistconcat<[ExtraAddrArgs,
560 !if(Gradients, dim.GradientArgs, []),
561 !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
562 !if(!eq(LodClampMip, ""),
564 [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
565 NumRetAndDataAnyTypes>.ret;
566 list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
// Address args with overloads resolved to the default concrete types
// (f32/i32), and to the 16-bit types for the A16 variants.
567 list<AMDGPUArg> AddrDefaultArgs =
568 !foreach(arg, AddrArgs,
569 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
570 !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
572 list<AMDGPUArg> AddrA16Args =
573 !foreach(arg, AddrArgs,
574 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
575 !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
// Copies every profile field from 'base' (used to derive tweaked profiles).
579 class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
580 let IsSample = base.IsSample;
581 let IsAtomic = base.IsAtomic;
582 let RetTypes = base.RetTypes;
583 let DataArgs = base.DataArgs;
584 let ExtraAddrArgs = base.ExtraAddrArgs;
585 let Gradients = base.Gradients;
586 let LodClampMip = base.LodClampMip;
// Profile for IMAGE_SAMPLE*/IMAGE_GATHER4*: float return, extra address
// args and lod/clamp taken from the sample variant.
589 class AMDGPUDimSampleProfile<string opmod,
591 AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
593 let RetTypes = [llvm_anyfloat_ty];
594 let ExtraAddrArgs = sample.ExtraAddrArgs;
595 let Gradients = sample.Gradients;
596 let LodClampMip = sample.LodOrClamp;
// Profile for plain load/store (optionally with a mip argument).
599 class AMDGPUDimNoSampleProfile<string opmod,
601 list<LLVMType> retty,
602 list<AMDGPUArg> dataargs,
603 bit Mip = 0> : AMDGPUDimProfile<opmod, dim> {
604 let RetTypes = retty;
605 let DataArgs = dataargs;
606 let LodClampMip = !if(Mip, "mip", "");
// Profile for image atomics: integer return, data operands supplied by
// the caller (vdata, or src+cmp for cmpswap).
609 class AMDGPUDimAtomicProfile<string opmod,
611 list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
612 let RetTypes = [llvm_anyint_ty];
613 let DataArgs = dataargs;
// Profile for IMAGE_GET_RESINFO: takes only a mip level.
617 class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
618 let RetTypes = [llvm_anyfloat_ty];
620 let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
621 let LodClampMip = "mip";
624 // All dimension-aware intrinsics are derived from this class.
// Assembles the full intrinsic signature from the profile: return types,
// optional vdata/dmask, address operands, resource descriptor, and the
// sampler/unorm pair for sample ops; also tags the resource-arg index via
// AMDGPURsrcIntrinsic.
625 class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
626 list<IntrinsicProperty> props,
627 list<SDNodeProperty> sdnodeprops> : Intrinsic<
628 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return
630 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
631 !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm)
632 P_.AddrTypes, // vaddr(VGPR)
633 [llvm_v8i32_ty], // rsrc(SGPR)
634 !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR)
635 llvm_i1_ty], []), // unorm(imm)
636 [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
637 llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
638 props, "", sdnodeprops>,
639 AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
640 !if(P_.IsAtomic, 0, 1)), 1> {
641 AMDGPUDimProfile P = P_;
643 AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);
645 let TargetPrefix = "amdgcn";
648 // Marker class for intrinsics with a DMask that determines the returned
650 class AMDGPUImageDMaskIntrinsic;
652 defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
654 //////////////////////////////////////////////////////////////////////////
655 // Load and store intrinsics
656 //////////////////////////////////////////////////////////////////////////
// Instantiates one intrinsic per non-MSAA dimension (NAME_<dim>).
// NOTE(review): the trailing "bit Mip" parameter line and the multiclass
// terminators (gaps at original 662 and 667-670) are elided from this
// listing.
657 multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
658 list<LLVMType> retty,
659 list<AMDGPUArg> dataargs,
660 list<IntrinsicProperty> props,
661 list<SDNodeProperty> sdnodeprops,
663 foreach dim = AMDGPUDims.NoMsaa in {
664 def !strconcat(NAME, "_", dim.Name)
665 : AMDGPUImageDimIntrinsic<
666 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
// Same, but over every dimension including the MSAA ones.
671 multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
672 list<LLVMType> retty,
673 list<AMDGPUArg> dataargs,
674 list<IntrinsicProperty> props,
675 list<SDNodeProperty> sdnodeprops,
677 foreach dim = AMDGPUDims.All in {
678 def !strconcat(NAME, "_", dim.Name)
679 : AMDGPUImageDimIntrinsic<
680 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
685 defm int_amdgcn_image_load
686 : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
688 AMDGPUImageDMaskIntrinsic;
689 defm int_amdgcn_image_load_mip
690 : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
691 [IntrReadMem], [SDNPMemOperand], 1>,
692 AMDGPUImageDMaskIntrinsic;
694 defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
695 "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
696 [IntrWriteMem], [SDNPMemOperand]>;
697 defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
698 "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
699 [IntrWriteMem], [SDNPMemOperand], 1>;
701 //////////////////////////////////////////////////////////////////////////
702 // sample and getlod intrinsics
703 //////////////////////////////////////////////////////////////////////////
// One intrinsic per non-MSAA dimension for a given sample variant; NoMem=1
// is used by getlod, which reads nothing from memory.
704 multiclass AMDGPUImageDimSampleDims<string opmod,
705 AMDGPUSampleVariant sample,
707 foreach dim = AMDGPUDims.NoMsaa in {
708 def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
709 AMDGPUDimSampleProfile<opmod, dim, sample>,
710 !if(NoMem, [IntrNoMem], [IntrReadMem]),
711 !if(NoMem, [], [SDNPMemOperand])>;
// Cross product: every sample variant x every non-MSAA dimension.
715 foreach sample = AMDGPUSampleVariants in {
716 defm int_amdgcn_image_sample # sample.LowerCaseMod
717 : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
718 AMDGPUImageDMaskIntrinsic;
721 defm int_amdgcn_image_getlod
722 : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
723 AMDGPUImageDMaskIntrinsic;
725 //////////////////////////////////////////////////////////////////////////
726 // getresinfo intrinsics
727 //////////////////////////////////////////////////////////////////////////
728 foreach dim = AMDGPUDims.All in {
729 def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
730 : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
731 AMDGPUImageDMaskIntrinsic;
734 //////////////////////////////////////////////////////////////////////////
735 // gather4 intrinsics
736 //////////////////////////////////////////////////////////////////////////
// Gather4 only supports gradient-free variants and 2D-style dimensions.
737 foreach sample = AMDGPUSampleVariantsNoGradients in {
738 foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
739 def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
740 AMDGPUImageDimIntrinsic<
741 AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
742 [IntrReadMem], [SDNPMemOperand]>;
747 //////////////////////////////////////////////////////////////////////////
749 //////////////////////////////////////////////////////////////////////////
750 defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
// One atomic image intrinsic per dimension, with caller-specified data
// operands (no dmask for atomics; empty property list since they both read
// and write memory).
751 multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs> {
752 foreach dim = AMDGPUDims.All in {
753 def !strconcat(NAME, "_", dim.Name)
754 : AMDGPUImageDimIntrinsic<
755 AMDGPUDimAtomicProfile<opmod, dim, dataargs>,
756 [], [SDNPMemOperand]>;
// Common single-operand (vdata) atomic shape.
760 multiclass AMDGPUImageDimAtomic<string opmod> {
761 defm "" : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">]>;
764 defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
765 defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
766 defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
767 defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
768 defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
769 defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
770 defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
771 defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
772 defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
773 defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
775 // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware,
776 // even though it clearly shouldn't be needed
777 defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
778 defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
// cmpswap takes two data operands: the new value and the compare value.
780 defm int_amdgcn_image_atomic_cmpswap :
781 AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
782 AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
785 //////////////////////////////////////////////////////////////////////////
787 //////////////////////////////////////////////////////////////////////////
789 let TargetPrefix = "amdgcn" in {
791 defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
// Buffer load: resource descriptor at arg 0 (AMDGPURsrcIntrinsic<0>).
// NOTE(review): the return-type line of this class (original 794) is elided
// from this listing.
793 class AMDGPUBufferLoad : Intrinsic <
795 [llvm_v4i32_ty, // rsrc(SGPR)
796 llvm_i32_ty, // vindex(VGPR)
797 llvm_i32_ty, // offset(SGPR/VGPR/imm)
798 llvm_i1_ty, // glc(imm)
799 llvm_i1_ty], // slc(imm)
800 [IntrReadMem], "", [SDNPMemOperand]>,
801 AMDGPURsrcIntrinsic<0>;
802 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
803 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
// Buffer store: vdata first, so the descriptor is arg 1.
805 class AMDGPUBufferStore : Intrinsic <
807 [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
808 llvm_v4i32_ty, // rsrc(SGPR)
809 llvm_i32_ty, // vindex(VGPR)
810 llvm_i32_ty, // offset(SGPR/VGPR/imm)
811 llvm_i1_ty, // glc(imm)
812 llvm_i1_ty], // slc(imm)
813 [IntrWriteMem], "", [SDNPMemOperand]>,
814 AMDGPURsrcIntrinsic<1>;
815 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
816 def int_amdgcn_buffer_store : AMDGPUBufferStore;
// Typed-buffer (tbuffer) accesses additionally carry data/number format
// immediates (dfmt/nfmt).
818 def int_amdgcn_tbuffer_load : Intrinsic <
819 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
820 [llvm_v4i32_ty, // rsrc(SGPR)
821 llvm_i32_ty, // vindex(VGPR)
822 llvm_i32_ty, // voffset(VGPR)
823 llvm_i32_ty, // soffset(SGPR)
824 llvm_i32_ty, // offset(imm)
825 llvm_i32_ty, // dfmt(imm)
826 llvm_i32_ty, // nfmt(imm)
827 llvm_i1_ty, // glc(imm)
828 llvm_i1_ty], // slc(imm)
829 [IntrReadMem], "", [SDNPMemOperand]>,
830 AMDGPURsrcIntrinsic<0>;
832 def int_amdgcn_tbuffer_store : Intrinsic <
834 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
835 llvm_v4i32_ty, // rsrc(SGPR)
836 llvm_i32_ty, // vindex(VGPR)
837 llvm_i32_ty, // voffset(VGPR)
838 llvm_i32_ty, // soffset(SGPR)
839 llvm_i32_ty, // offset(imm)
840 llvm_i32_ty, // dfmt(imm)
841 llvm_i32_ty, // nfmt(imm)
842 llvm_i1_ty, // glc(imm)
843 llvm_i1_ty], // slc(imm)
844 [IntrWriteMem], "", [SDNPMemOperand]>,
845 AMDGPURsrcIntrinsic<1>;
// Buffer atomics: i32-only; empty property list since they read and write.
847 class AMDGPUBufferAtomic : Intrinsic <
849 [llvm_i32_ty, // vdata(VGPR)
850 llvm_v4i32_ty, // rsrc(SGPR)
851 llvm_i32_ty, // vindex(VGPR)
852 llvm_i32_ty, // offset(SGPR/VGPR/imm)
853 llvm_i1_ty], // slc(imm)
854 [], "", [SDNPMemOperand]>,
855 AMDGPURsrcIntrinsic<1, 0>;
856 def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
857 def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
858 def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
859 def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
860 def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
861 def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
862 def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
863 def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
864 def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
865 def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
// cmpswap has src and cmp, pushing the descriptor to arg 2.
866 def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
868 [llvm_i32_ty, // src(VGPR)
869 llvm_i32_ty, // cmp(VGPR)
870 llvm_v4i32_ty, // rsrc(SGPR)
871 llvm_i32_ty, // vindex(VGPR)
872 llvm_i32_ty, // offset(SGPR/VGPR/imm)
873 llvm_i1_ty], // slc(imm)
874 [], "", [SDNPMemOperand]>,
875 AMDGPURsrcIntrinsic<2, 0>;
877 } // defset AMDGPUBufferIntrinsics
879 // Uses that do not set the done bit should set IntrWriteMem on the
881 def int_amdgcn_exp : Intrinsic <[], [
884 llvm_any_ty, // src0 (f32 or i32)
885 LLVMMatchType<0>, // src1
886 LLVMMatchType<0>, // src2
887 LLVMMatchType<0>, // src3
894 // exp with compr bit set.
895 def int_amdgcn_exp_compr : Intrinsic <[], [
898 llvm_anyvector_ty, // src0 (v2f16 or v2i16)
899 LLVMMatchType<0>, // src1
905 def int_amdgcn_buffer_wbinvl1_sc :
906 GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
907 Intrinsic<[], [], []>;
909 def int_amdgcn_buffer_wbinvl1 :
910 GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
911 Intrinsic<[], [], []>;
913 def int_amdgcn_s_dcache_inv :
914 GCCBuiltin<"__builtin_amdgcn_s_dcache_inv">,
915 Intrinsic<[], [], []>;
917 def int_amdgcn_s_memtime :
918 GCCBuiltin<"__builtin_amdgcn_s_memtime">,
919 Intrinsic<[llvm_i64_ty], [], [IntrReadMem]>;
921 def int_amdgcn_s_sleep :
922 GCCBuiltin<"__builtin_amdgcn_s_sleep">,
923 Intrinsic<[], [llvm_i32_ty], []> {
926 def int_amdgcn_s_incperflevel :
927 GCCBuiltin<"__builtin_amdgcn_s_incperflevel">,
928 Intrinsic<[], [llvm_i32_ty], []> {
931 def int_amdgcn_s_decperflevel :
932 GCCBuiltin<"__builtin_amdgcn_s_decperflevel">,
933 Intrinsic<[], [llvm_i32_ty], []> {
// Read a hardware state register selected by the i32 operand (s_getreg).
// IntrReadMem prevents CSE across points where the register may change.
936 def int_amdgcn_s_getreg :
937 GCCBuiltin<"__builtin_amdgcn_s_getreg">,
938 Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
939 [IntrReadMem, IntrSpeculatable]
942 // int_amdgcn_s_getpc is provided to allow a specific style of position
943 // independent code to determine the high part of its address when it is
944 // known (through convention) that the code and any data of interest does
945 // not cross a 4Gb address boundary. Use for any other purpose may not
946 // produce the desired results as optimizations may cause code movement,
947 // especially as we explicitly use IntrNoMem to allow optimizations.
// Returns a 64-bit code address (see the note above about its intended use).
948 def int_amdgcn_s_getpc :
949 GCCBuiltin<"__builtin_amdgcn_s_getpc">,
950 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;
952 // __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
953 // param values: 0 = P10, 1 = P20, 2 = P0
// Moves the selected per-primitive parameter for the given attribute/channel
// without interpolating; m0 carries the LDS attribute base.
954 def int_amdgcn_interp_mov :
955 GCCBuiltin<"__builtin_amdgcn_interp_mov">,
956 Intrinsic<[llvm_float_ty],
957 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
958 [IntrNoMem, IntrSpeculatable]>;
960 // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
961 // This intrinsic reads from lds, but the memory values are constant,
962 // so it behaves like IntrNoMem.
// First stage of two-stage attribute interpolation; <i> is the barycentric
// i coordinate.
963 def int_amdgcn_interp_p1 :
964 GCCBuiltin<"__builtin_amdgcn_interp_p1">,
965 Intrinsic<[llvm_float_ty],
966 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
967 [IntrNoMem, IntrSpeculatable]>;
969 // __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
// Second stage of attribute interpolation: combines the p1 result with the
// barycentric j coordinate.
970 def int_amdgcn_interp_p2 :
971 GCCBuiltin<"__builtin_amdgcn_interp_p2">,
972 Intrinsic<[llvm_float_ty],
973 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
974 [IntrNoMem, IntrSpeculatable]>;
975 // See int_amdgcn_interp_p1 for why this is IntrNoMem.
977 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
978 // invocation for derivative computation).
979 def int_amdgcn_ps_live : Intrinsic <
// llvm.amdgcn.mbcnt.lo <mask>, <base>
// Counts set bits of <mask> among the lower 32 lanes below the current lane
// and adds <base>.
984 def int_amdgcn_mbcnt_lo :
985 GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
986 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
// llvm.amdgcn.mbcnt.hi <mask>, <base>
// Same as mbcnt.lo but counts in the upper 32 lanes; typically chained after
// mbcnt.lo to compute a lane index within a 64-lane wave.
988 def int_amdgcn_mbcnt_hi :
989 GCCBuiltin<"__builtin_amdgcn_mbcnt_hi">,
990 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
992 // llvm.amdgcn.ds.swizzle src offset
// Cross-lane swizzle of src within a wave; the pattern is encoded in the
// offset operand. Convergent: the result depends on the set of active lanes.
993 def int_amdgcn_ds_swizzle :
994 GCCBuiltin<"__builtin_amdgcn_ds_swizzle">,
995 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
// llvm.amdgcn.ubfe <src>, <offset>, <width>
// Unsigned bitfield extract: width bits of src starting at offset,
// zero-extended.
997 def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
998 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
999 [IntrNoMem, IntrSpeculatable]
// llvm.amdgcn.sbfe <src>, <offset>, <width>
// Signed bitfield extract: same as ubfe but the result is sign-extended.
1002 def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
1003 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
1004 [IntrNoMem, IntrSpeculatable]
// v_lerp_u8: per-byte unsigned average of the first two operands with the
// rounding bits supplied by the third.
1007 def int_amdgcn_lerp :
1008 GCCBuiltin<"__builtin_amdgcn_lerp">,
1009 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1010 [IntrNoMem, IntrSpeculatable]
// Sum-of-absolute-differences family. Each maps to the corresponding
// v_sad_*/v_msad_* instruction; the third operand is an accumulator added
// to the result.
1013 def int_amdgcn_sad_u8 :
1014 GCCBuiltin<"__builtin_amdgcn_sad_u8">,
1015 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1016 [IntrNoMem, IntrSpeculatable]
// Masked SAD: bytes that are zero in the second operand are skipped.
1019 def int_amdgcn_msad_u8 :
1020 GCCBuiltin<"__builtin_amdgcn_msad_u8">,
1021 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1022 [IntrNoMem, IntrSpeculatable]
1025 def int_amdgcn_sad_hi_u8 :
1026 GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">,
1027 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1028 [IntrNoMem, IntrSpeculatable]
// 16-bit-element variant of sad_u8.
1031 def int_amdgcn_sad_u16 :
1032 GCCBuiltin<"__builtin_amdgcn_sad_u16">,
1033 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1034 [IntrNoMem, IntrSpeculatable]
// Quad/Masked-quad SAD instructions (v_qsad_pk_u16_u8, v_mqsad_pk_u16_u8,
// v_mqsad_u32_u8): SAD over a sliding byte window of the i64 operand.
1037 def int_amdgcn_qsad_pk_u16_u8 :
1038 GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
1039 Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
1040 [IntrNoMem, IntrSpeculatable]
1043 def int_amdgcn_mqsad_pk_u16_u8 :
1044 GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
1045 Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
1046 [IntrNoMem, IntrSpeculatable]
// Variant with four 32-bit accumulators instead of packed 16-bit results.
1049 def int_amdgcn_mqsad_u32_u8 :
1050 GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
1051 Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
1052 [IntrNoMem, IntrSpeculatable]
// v_cvt_pk_u8_f32: convert the float operand to u8 and insert it into the
// byte of the third operand selected by the second.
1055 def int_amdgcn_cvt_pk_u8_f32 :
1056 GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
1057 Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
1058 [IntrNoMem, IntrSpeculatable]
// Wave-wide ballot compare: evaluates the comparison selected by the trailing
// i32 condition-code operand on every active lane and returns the per-lane
// results packed into an i64 mask. Convergent: the result depends on which
// lanes are active. NOTE(review): confirm the CC operand's expected encoding
// (CmpInst predicate values) at the lowering code.
1061 def int_amdgcn_icmp :
1062 Intrinsic<[llvm_i64_ty], [llvm_anyint_ty, LLVMMatchType<0>, llvm_i32_ty],
1063 [IntrNoMem, IntrConvergent]>;
// Floating-point counterpart of int_amdgcn_icmp.
1065 def int_amdgcn_fcmp :
1066 Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
1067 [IntrNoMem, IntrConvergent]>;
// Returns the source value from the first active lane of the wave; convergent
// because the result depends on the set of active lanes.
1069 def int_amdgcn_readfirstlane :
1070 GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
1071 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1073 // The lane argument must be uniform across the currently active threads of the
1074 // current wave. Otherwise, the result is undefined.
1075 def int_amdgcn_readlane :
1076 GCCBuiltin<"__builtin_amdgcn_readlane">,
1077 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1079 // The value to write and lane select arguments must be uniform across the
1080 // currently active threads of the current wave. Otherwise, the result is
// undefined.
1082 def int_amdgcn_writelane :
1083 GCCBuiltin<"__builtin_amdgcn_writelane">,
1084 Intrinsic<[llvm_i32_ty], [
1085 llvm_i32_ty, // uniform value to write: returned by the selected lane
1086 llvm_i32_ty, // uniform lane select
1087 llvm_i32_ty // returned by all lanes other than the selected one
1089 [IntrNoMem, IntrConvergent]
// v_alignbit_b32: extract 32 bits from the 64-bit concatenation of the first
// two operands, shifted right by the third.
1092 def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
1093 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1094 [IntrNoMem, IntrSpeculatable]
// v_alignbyte_b32: same as alignbit but the shift amount is in bytes.
1097 def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
1098 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
1099 [IntrNoMem, IntrSpeculatable]
1103 // Copies the source value to the destination value, with the guarantee that
1104 // the source value is computed as if the entire program were executed in WQM.
// (WQM = Whole Quad Mode: all four pixels of each quad execute.)
1105 def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
1106 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
1109 // Return true if at least one thread within the pixel quad passes true into
// the function. Convergent: the vote depends on the active lanes of the quad.
1111 def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
1112 [llvm_i1_ty], [IntrNoMem, IntrConvergent]
1115 // If false, set EXEC=0 for the current thread until the end of program.
// Used to implement pixel-shader discard/kill.
1116 def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
1118 // Copies the active channels of the source value to the destination value,
1119 // with the guarantee that the source value is computed as if the entire
1120 // program were executed in Whole Wavefront Mode, i.e. with all channels
1121 // enabled, with a few exceptions: - Phi nodes which require WWM return an
// undefined value.
1123 def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
1124 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
1127 // Given a value, copies it while setting all the inactive lanes to a given
1128 // value. Note that OpenGL helper lanes are considered active, so if the
1129 // program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
1131 def int_amdgcn_set_inactive :
1132 Intrinsic<[llvm_anyint_ty],
1133 [LLVMMatchType<0>, // value to be copied
1134 LLVMMatchType<0>], // value for the inactive lanes to take
1135 [IntrNoMem, IntrConvergent]>;
1137 //===----------------------------------------------------------------------===//
// CI+ Intrinsics
1139 //===----------------------------------------------------------------------===//
// Invalidate volatile lines of the scalar data cache.
1141 def int_amdgcn_s_dcache_inv_vol :
1142 GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
1143 Intrinsic<[], [], []>;
// Write back and invalidate volatile lines of the vector L1 cache.
1145 def int_amdgcn_buffer_wbinvl1_vol :
1146 GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
1147 Intrinsic<[], [], []>;
1149 //===----------------------------------------------------------------------===//
// VI Intrinsics
1151 //===----------------------------------------------------------------------===//
1153 // llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Cross-lane data-parallel-primitive move. NOTE(review): subsumed by
// int_amdgcn_update_dpp below, which supplies the <old> value explicitly —
// prefer update_dpp in new code.
1154 def int_amdgcn_mov_dpp :
1155 Intrinsic<[llvm_anyint_ty],
1156 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
1157 llvm_i1_ty], [IntrNoMem, IntrConvergent]>;
1159 // llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
1160 // Should be equivalent to:
1161 // v_mov_b32 <dest> <old>
1162 // v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
1163 def int_amdgcn_update_dpp :
1164 Intrinsic<[llvm_anyint_ty],
1165 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
1166 llvm_i32_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent]>;
// Write back dirty lines of the scalar data cache.
1168 def int_amdgcn_s_dcache_wb :
1169 GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
1170 Intrinsic<[], [], []>;
// Write back dirty volatile lines of the scalar data cache.
1172 def int_amdgcn_s_dcache_wb_vol :
1173 GCCBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
1174 Intrinsic<[], [], []>;
// Real-time counter variant of s_memtime; same IntrReadMem rationale.
1176 def int_amdgcn_s_memrealtime :
1177 GCCBuiltin<"__builtin_amdgcn_s_memrealtime">,
1178 Intrinsic<[llvm_i64_ty], [], [IntrReadMem]>;
1180 // llvm.amdgcn.ds.permute <index> <src>
// Forward ("push") lane permute: each lane writes src to the lane selected
// by its index operand. Convergent; no LDS memory is actually accessed.
1181 def int_amdgcn_ds_permute :
1182 GCCBuiltin<"__builtin_amdgcn_ds_permute">,
1183 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1185 // llvm.amdgcn.ds.bpermute <index> <src>
// Backward ("pull") lane permute: each lane reads src from the lane selected
// by its index operand.
1186 def int_amdgcn_ds_bpermute :
1187 GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
1188 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1190 //===----------------------------------------------------------------------===//
1191 // Deep learning intrinsics.
1192 //===----------------------------------------------------------------------===//
// Dot-product intrinsics. In every variant the final i1 operand (%clamp)
// requests saturation of the result. NOTE(review): %clamp presumably must
// be an immediate — confirm at the lowering code.
1194 // f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
1195 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1196 def int_amdgcn_fdot2 :
1197 GCCBuiltin<"__builtin_amdgcn_fdot2">,
1199 [llvm_float_ty], // %r
1201 llvm_v2f16_ty, // %a
1202 llvm_v2f16_ty, // %b
1203 llvm_float_ty, // %c
1204 llvm_i1_ty // %clamp
1206 [IntrNoMem, IntrSpeculatable]
1209 // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
1210 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1211 def int_amdgcn_sdot2 :
1212 GCCBuiltin<"__builtin_amdgcn_sdot2">,
1214 [llvm_i32_ty], // %r
1216 llvm_v2i16_ty, // %a
1217 llvm_v2i16_ty, // %b
1219 llvm_i1_ty // %clamp
1221 [IntrNoMem, IntrSpeculatable]
1224 // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
1225 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1226 def int_amdgcn_udot2 :
1227 GCCBuiltin<"__builtin_amdgcn_udot2">,
1229 [llvm_i32_ty], // %r
1231 llvm_v2i16_ty, // %a
1232 llvm_v2i16_ty, // %b
1234 llvm_i1_ty // %clamp
1236 [IntrNoMem, IntrSpeculatable]
1239 // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
1240 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1241 def int_amdgcn_sdot4 :
1242 GCCBuiltin<"__builtin_amdgcn_sdot4">,
1244 [llvm_i32_ty], // %r
1249 llvm_i1_ty // %clamp
1251 [IntrNoMem, IntrSpeculatable]
1254 // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
1255 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1256 def int_amdgcn_udot4 :
1257 GCCBuiltin<"__builtin_amdgcn_udot4">,
1259 [llvm_i32_ty], // %r
1264 llvm_i1_ty // %clamp
1266 [IntrNoMem, IntrSpeculatable]
1269 // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
1270 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1271 // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1272 def int_amdgcn_sdot8 :
1273 GCCBuiltin<"__builtin_amdgcn_sdot8">,
1275 [llvm_i32_ty], // %r
1280 llvm_i1_ty // %clamp
1282 [IntrNoMem, IntrSpeculatable]
1285 // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
1286 // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1287 // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1288 def int_amdgcn_udot8 :
1289 GCCBuiltin<"__builtin_amdgcn_udot8">,
1291 [llvm_i32_ty], // %r
1296 llvm_i1_ty // %clamp
1298 [IntrNoMem, IntrSpeculatable]
1301 //===----------------------------------------------------------------------===//
1302 // Special Intrinsics for backend internal use only. No frontend
1303 // should emit calls to these.
1304 //===----------------------------------------------------------------------===//
// Structurized control-flow primitives inserted when lowering divergent
// branches; the i64 values carry saved EXEC masks between the paired
// if/else/break/loop/end_cf intrinsics. All are convergent so they are not
// moved across divergence boundaries.
1305 def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
1306 [llvm_i1_ty], [IntrConvergent]
1309 def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
1310 [llvm_i64_ty], [IntrConvergent]
1313 def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
1314 [llvm_i64_ty], [IntrNoMem, IntrConvergent]
1317 def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
1318 [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
1321 def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
1322 [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
1325 def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
1326 [llvm_i64_ty], [IntrConvergent]
// Restores the EXEC mask saved by the matching if/else at the join point.
1329 def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
1331 // Represent unreachable in a divergent region.
// Convergent so it cannot be moved into or out of the divergent region.
1332 def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;
1334 // Emit 2.5 ulp, no denormal division. Should only be inserted by
1335 // pass based on !fpmath metadata.
1336 def int_amdgcn_fdiv_fast : Intrinsic<
1337 [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
1338 [IntrNoMem, IntrSpeculatable]