1 //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file describes the PTX instructions in TableGen format.
12 //===----------------------------------------------------------------------===//
14 include "NVPTXInstrFormats.td"
17 let hasSideEffects = 0 in {
18 def NOP : NVPTXInst<(outs), (ins), "", []>;
21 // List of vector specific properties
22 def isVecLD : VecInstTypeEnum<1>;
23 def isVecST : VecInstTypeEnum<2>;
24 def isVecBuild : VecInstTypeEnum<3>;
25 def isVecShuffle : VecInstTypeEnum<4>;
26 def isVecExtract : VecInstTypeEnum<5>;
27 def isVecInsert : VecInstTypeEnum<6>;
28 def isVecDest : VecInstTypeEnum<7>;
29 def isVecOther : VecInstTypeEnum<15>;
31 //===----------------------------------------------------------------------===//
32 // NVPTX Operand Definitions.
33 //===----------------------------------------------------------------------===//
35 def brtarget : Operand<OtherVT>;
37 // CVT conversion modes
38 // These must match the enum in NVPTX.h
//
// Each leaf matches exactly one i32 immediate. The constants below are laid
// out as follows:
//   0x0  - 0x8  : the base modes (CvtNONE, the four *I modes, then
//                 RN/RZ/RM/RP).
//   0x10 - 0x18 : the *_FTZ leaves; each equals its base value with bit 0x10
//                 set.
//   0x20        : CvtSAT.
//   0x30        : CvtSAT_FTZ, i.e. 0x20 | 0x10.
// NOTE(review): this layout is read off the constants in this file; confirm it
// against the enum in NVPTX.h before adding new modes.
39 def CvtNONE : PatLeaf<(i32 0x0)>;
40 def CvtRNI : PatLeaf<(i32 0x1)>;
41 def CvtRZI : PatLeaf<(i32 0x2)>;
42 def CvtRMI : PatLeaf<(i32 0x3)>;
43 def CvtRPI : PatLeaf<(i32 0x4)>;
44 def CvtRN : PatLeaf<(i32 0x5)>;
45 def CvtRZ : PatLeaf<(i32 0x6)>;
46 def CvtRM : PatLeaf<(i32 0x7)>;
47 def CvtRP : PatLeaf<(i32 0x8)>;
// Same modes as above with bit 0x10 set.
49 def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
50 def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
51 def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
52 def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
53 def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
54 def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
55 def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
56 def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
57 def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
// Saturating variants; only the "no rounding mode" base value is combined
// with 0x20 here.
59 def CvtSAT : PatLeaf<(i32 0x20)>;
60 def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
62 def CvtMode : Operand<i32> {
63 let PrintMethod = "printCvtMode";
// Comparison modes.
67 // These must match the enum in NVPTX.h
//
// Each leaf matches exactly one i32 immediate. Values 0-5 are EQ/NE/LT/LE/GT/GE,
// 10-15 are the *U counterparts in the same order, and 16/17 are NUM/NAN.
// Values 6-9 are not assigned by any leaf in this list.
68 def CmpEQ : PatLeaf<(i32 0)>;
69 def CmpNE : PatLeaf<(i32 1)>;
70 def CmpLT : PatLeaf<(i32 2)>;
71 def CmpLE : PatLeaf<(i32 3)>;
72 def CmpGT : PatLeaf<(i32 4)>;
73 def CmpGE : PatLeaf<(i32 5)>;
74 def CmpEQU : PatLeaf<(i32 10)>;
75 def CmpNEU : PatLeaf<(i32 11)>;
76 def CmpLTU : PatLeaf<(i32 12)>;
77 def CmpLEU : PatLeaf<(i32 13)>;
78 def CmpGTU : PatLeaf<(i32 14)>;
79 def CmpGEU : PatLeaf<(i32 15)>;
80 def CmpNUM : PatLeaf<(i32 16)>;
81 def CmpNAN : PatLeaf<(i32 17)>;
// *_FTZ leaves: each equals the corresponding value above with bit 0x100 set
// (e.g. CmpEQU = 10 = 0xA, CmpEQU_FTZ = 0x10A).
83 def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
84 def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
85 def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
86 def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
87 def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
88 def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
89 def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
90 def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
91 def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
92 def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
93 def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
94 def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
95 def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
96 def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
98 def CmpMode : Operand<i32> {
99 let PrintMethod = "printCmpMode";
102 //===----------------------------------------------------------------------===//
103 // NVPTX Instruction Predicate Definitions
104 //===----------------------------------------------------------------------===//
// Subtarget feature predicates. Unless noted otherwise, each one forwards to
// the like-named accessor on Subtarget.
107 def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
108 def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
109 def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
// Holds when hasAtomRedGen32() is false but hasAtomRedG32() is true.
// NOTE(review): presumably lets the G32 forms stand in for the Gen32 ones;
// confirm at the use sites.
110 def useAtomRedG32forGen32 :
111 Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
112 def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
113 def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
114 def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
115 def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
// 64-bit counterpart of useAtomRedG32forGen32: Gen64 absent, G64 present.
116 def useAtomRedG64forGen64 :
117 Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
118 def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
119 def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
120 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
121 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
122 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
123 def hasVote : Predicate<"Subtarget->hasVote()">;
124 def hasDouble : Predicate<"Subtarget->hasDouble()">;
125 def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
126 def hasLDG : Predicate<"Subtarget->hasLDG()">;
127 def hasLDU : Predicate<"Subtarget->hasLDU()">;
128 def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
// The predicates below call unqualified helpers (no Subtarget-> prefix), so
// those names must be visible where the predicate code is emitted.
// doF32FTZ and doNoF32FTZ are exact complements of each other.
130 def doF32FTZ : Predicate<"useF32FTZ()">;
131 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
// NOTE(review): unlike its neighbours this tests a bare identifier
// (`doMulWide`), not a call; confirm what it resolves to in the selector.
133 def doMulWide : Predicate<"doMulWide">;
// allowFMA and noFMA are exact complements of each other.
135 def allowFMA : Predicate<"allowFMA()">;
136 def noFMA : Predicate<"!allowFMA()">;
// Only division levels 0 and 1 have a predicate; any other value returned by
// getDivF32Level() satisfies neither of these.
138 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
139 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
// Complementary pair keyed on usePrecSqrtF32().
141 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
142 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
// Complementary pair keyed on Subtarget->hasHWROT32().
144 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
145 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
// Predicate that is always satisfied.
147 def true : Predicate<"true">;
// Holds when the PTX version reported by the subtarget is at least 31.
149 def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
152 //===----------------------------------------------------------------------===//
153 // Some Common Instruction Class Templates
154 //===----------------------------------------------------------------------===//
156 // Template for instructions which take three int64, int32, or int16 args.
157 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
158 multiclass I3<string OpcStr, SDNode OpNode> {
160 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
161 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
162 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
164 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
165 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
166 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
168 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
169 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
170 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
172 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
173 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
174 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
176 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
177 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
178 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
180 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
181 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
182 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
185 // Template for instructions which take 3 int32 args. The instructions are
186 // named "<OpcStr>.s32" (e.g. "addc.cc.s32").
187 multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
189 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
190 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
191 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
193 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
194 !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
195 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
198 // Template for instructions which take three fp64 or fp32 args. The
199 // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
201 // Also defines ftz (flush subnormal inputs and results to sign-preserving
202 // zero) variants for fp32 functions.
204 // This multiclass should be used for nodes that cannot be folded into FMAs.
205 // For nodes that can be folded into FMAs (i.e. adds and muls), use
// F3_fma_component instead.
207 multiclass F3<string OpcStr, SDNode OpNode> {
209 NVPTXInst<(outs Float64Regs:$dst),
210 (ins Float64Regs:$a, Float64Regs:$b),
211 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
212 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
214 NVPTXInst<(outs Float64Regs:$dst),
215 (ins Float64Regs:$a, f64imm:$b),
216 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
217 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
219 NVPTXInst<(outs Float32Regs:$dst),
220 (ins Float32Regs:$a, Float32Regs:$b),
221 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
222 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
223 Requires<[doF32FTZ]>;
225 NVPTXInst<(outs Float32Regs:$dst),
226 (ins Float32Regs:$a, f32imm:$b),
227 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
228 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
229 Requires<[doF32FTZ]>;
231 NVPTXInst<(outs Float32Regs:$dst),
232 (ins Float32Regs:$a, Float32Regs:$b),
233 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
234 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
236 NVPTXInst<(outs Float32Regs:$dst),
237 (ins Float32Regs:$a, f32imm:$b),
238 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
239 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
242 // Template for instructions which take three fp64 or fp32 args. The
243 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
245 // Also defines ftz (flush subnormal inputs and results to sign-preserving
246 // zero) variants for fp32 functions.
248 // This multiclass should be used for nodes that can be folded to make fma ops.
249 // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
250 // just like the non ".rn" op, but prevents ptxas from creating FMAs.
251 multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
253 NVPTXInst<(outs Float64Regs:$dst),
254 (ins Float64Regs:$a, Float64Regs:$b),
255 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
256 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
257 Requires<[allowFMA]>;
259 NVPTXInst<(outs Float64Regs:$dst),
260 (ins Float64Regs:$a, f64imm:$b),
261 !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
262 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
263 Requires<[allowFMA]>;
265 NVPTXInst<(outs Float32Regs:$dst),
266 (ins Float32Regs:$a, Float32Regs:$b),
267 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
268 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
269 Requires<[allowFMA, doF32FTZ]>;
271 NVPTXInst<(outs Float32Regs:$dst),
272 (ins Float32Regs:$a, f32imm:$b),
273 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
274 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
275 Requires<[allowFMA, doF32FTZ]>;
277 NVPTXInst<(outs Float32Regs:$dst),
278 (ins Float32Regs:$a, Float32Regs:$b),
279 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
280 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
281 Requires<[allowFMA]>;
283 NVPTXInst<(outs Float32Regs:$dst),
284 (ins Float32Regs:$a, f32imm:$b),
285 !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
286 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
287 Requires<[allowFMA]>;
289 // These have strange names so we don't perturb existing mir tests.
291 NVPTXInst<(outs Float64Regs:$dst),
292 (ins Float64Regs:$a, Float64Regs:$b),
293 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
294 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
297 NVPTXInst<(outs Float64Regs:$dst),
298 (ins Float64Regs:$a, f64imm:$b),
299 !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
300 [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
303 NVPTXInst<(outs Float32Regs:$dst),
304 (ins Float32Regs:$a, Float32Regs:$b),
305 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
306 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
307 Requires<[noFMA, doF32FTZ]>;
309 NVPTXInst<(outs Float32Regs:$dst),
310 (ins Float32Regs:$a, f32imm:$b),
311 !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
312 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
313 Requires<[noFMA, doF32FTZ]>;
315 NVPTXInst<(outs Float32Regs:$dst),
316 (ins Float32Regs:$a, Float32Regs:$b),
317 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
318 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
321 NVPTXInst<(outs Float32Regs:$dst),
322 (ins Float32Regs:$a, f32imm:$b),
323 !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
324 [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
328 // Template for operations which take two f32 or f64 operands. Provides three
329 // instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
330 // subnormal inputs and results to zero).
331 multiclass F2<string OpcStr, SDNode OpNode> {
332 def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
333 !strconcat(OpcStr, ".f64 \t$dst, $a;"),
334 [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
335 def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
336 !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
337 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
338 Requires<[doF32FTZ]>;
339 def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
340 !strconcat(OpcStr, ".f32 \t$dst, $a;"),
341 [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
344 //===----------------------------------------------------------------------===//
345 // NVPTX Instructions.
346 //===----------------------------------------------------------------------===//
348 //-----------------------------------
350 //-----------------------------------
352 let hasSideEffects = 0 in {
353 // Generate a cvt to the given type from all possible types. Each instance
354 // takes a CvtMode immediate that defines the conversion mode to use. It can
355 // be CvtNONE to omit a conversion mode.
356 multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
358 NVPTXInst<(outs RC:$dst),
359 (ins Int16Regs:$src, CvtMode:$mode),
360 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
361 FromName, ".s8\t$dst, $src;"), []>;
363 NVPTXInst<(outs RC:$dst),
364 (ins Int16Regs:$src, CvtMode:$mode),
365 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
366 FromName, ".u8\t$dst, $src;"), []>;
368 NVPTXInst<(outs RC:$dst),
369 (ins Int16Regs:$src, CvtMode:$mode),
370 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
371 FromName, ".s16\t$dst, $src;"), []>;
373 NVPTXInst<(outs RC:$dst),
374 (ins Int16Regs:$src, CvtMode:$mode),
375 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
376 FromName, ".u16\t$dst, $src;"), []>;
378 NVPTXInst<(outs RC:$dst),
379 (ins Int16Regs:$src, CvtMode:$mode),
380 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
381 FromName, ".f16\t$dst, $src;"), []>;
383 NVPTXInst<(outs RC:$dst),
384 (ins Int32Regs:$src, CvtMode:$mode),
385 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
386 FromName, ".s32\t$dst, $src;"), []>;
388 NVPTXInst<(outs RC:$dst),
389 (ins Int32Regs:$src, CvtMode:$mode),
390 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
391 FromName, ".u32\t$dst, $src;"), []>;
393 NVPTXInst<(outs RC:$dst),
394 (ins Int64Regs:$src, CvtMode:$mode),
395 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
396 FromName, ".s64\t$dst, $src;"), []>;
398 NVPTXInst<(outs RC:$dst),
399 (ins Int64Regs:$src, CvtMode:$mode),
400 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
401 FromName, ".u64\t$dst, $src;"), []>;
403 NVPTXInst<(outs RC:$dst),
404 (ins Float32Regs:$src, CvtMode:$mode),
405 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
406 FromName, ".f32\t$dst, $src;"), []>;
408 NVPTXInst<(outs RC:$dst),
409 (ins Float64Regs:$src, CvtMode:$mode),
410 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
411 FromName, ".f64\t$dst, $src;"), []>;
414 // Generate cvts from all types to all types.
//
// For each instantiation, the first argument is the type name that
// CVT_FROM_ALL splices into the mnemonic directly after the mode modifiers
// ("cvt<mode>.<name>.<src-type>"), and the second is the register class used
// for the (outs) operand. Note that, despite the parameter being called
// FromName, it therefore describes the result side of the conversion.
// The 8-bit, 16-bit and f16 results all use Int16Regs.
415 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
416 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
417 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
418 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
419 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
420 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
421 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
422 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
423 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
424 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
425 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
427 // These cvts are different from those above: The source and dest registers
428 // are of the same type.
//
// Each def is named CVT_INREG_<dst>_<src> and prints a fixed
// "cvt.<dst>.<src>" string with no mode operand. The register class is chosen
// by the <dst> width (Int16Regs / Int32Regs / Int64Regs) and is used for both
// $dst and $src. All pattern lists are empty, so nothing in these defs makes
// them selectable by pattern matching.
// NOTE(review): presumably they are emitted by hand-written selection code for
// in-register sign extension; confirm in the C++ selector.
429 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
430 "cvt.s16.s8 \t$dst, $src;", []>;
431 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
432 "cvt.s32.s8 \t$dst, $src;", []>;
433 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
434 "cvt.s32.s16 \t$dst, $src;", []>;
435 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
436 "cvt.s64.s8 \t$dst, $src;", []>;
437 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
438 "cvt.s64.s16 \t$dst, $src;", []>;
439 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
440 "cvt.s64.s32 \t$dst, $src;", []>;
443 //-----------------------------------
444 // Integer Arithmetic
445 //-----------------------------------
447 // Template for xor masquerading as int1 arithmetic.
448 multiclass ADD_SUB_i1<SDNode OpNode> {
449 def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
450 "xor.pred \t$dst, $a, $b;",
451 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
452 def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
453 "xor.pred \t$dst, $a, $b;",
454 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
457 // int1 addition and subtraction are both just xor.
458 defm ADD_i1 : ADD_SUB_i1<add>;
459 defm SUB_i1 : ADD_SUB_i1<sub>;
461 // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
462 // also use these for unsigned arithmetic.
463 defm ADD : I3<"add.s", add>;
464 defm SUB : I3<"sub.s", sub>;
466 // int32 addition and subtraction with carry-out.
467 // FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
468 defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
469 defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
471 // int32 addition and subtraction with carry-in and carry-out.
472 defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
473 defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
475 defm MULT : I3<"mul.lo.s", mul>;
477 defm MULTHS : I3<"mul.hi.s", mulhs>;
478 defm MULTHU : I3<"mul.hi.u", mulhu>;
480 defm SDIV : I3<"div.s", sdiv>;
481 defm UDIV : I3<"div.u", udiv>;
483 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower them.
485 defm SREM : I3<"rem.s", srem>;
486 defm UREM : I3<"rem.u", urem>;
490 // Wide multiplication
493 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
494 "mul.wide.s32 \t$dst, $a, $b;", []>;
496 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
497 "mul.wide.s32 \t$dst, $a, $b;", []>;
498 def MULWIDES64Imm64 :
499 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
500 "mul.wide.s32 \t$dst, $a, $b;", []>;
503 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
504 "mul.wide.u32 \t$dst, $a, $b;", []>;
506 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
507 "mul.wide.u32 \t$dst, $a, $b;", []>;
508 def MULWIDEU64Imm64 :
509 NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
510 "mul.wide.u32 \t$dst, $a, $b;", []>;
513 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
514 "mul.wide.s16 \t$dst, $a, $b;", []>;
516 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
517 "mul.wide.s16 \t$dst, $a, $b;", []>;
518 def MULWIDES32Imm32 :
519 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
520 "mul.wide.s16 \t$dst, $a, $b;", []>;
523 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
524 "mul.wide.u16 \t$dst, $a, $b;", []>;
526 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
527 "mul.wide.u16 \t$dst, $a, $b;", []>;
528 def MULWIDEU32Imm32 :
529 NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
530 "mul.wide.u16 \t$dst, $a, $b;", []>;
// Type profile shared by the two mul.wide nodes: one result and two operands,
// with the two operands (indices 1 and 2) constrained to the same type. The
// result type is not tied to the operand type by this profile.
532 def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
// Target-specific DAG nodes for the signed and unsigned widening multiply.
533 def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
534 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
536 // Matchers for signed, unsigned mul.wide ISD nodes.
//
// Every pattern here is gated on doMulWide. An i32 result with Int16Regs
// operands selects MULWIDES32 / MULWIDEU32; an i64 result with Int32Regs
// operands selects MULWIDES64 / MULWIDEU64. When the second operand is an
// immediate, the corresponding *Imm instruction is selected instead.
537 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
538 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
539 Requires<[doMulWide]>;
540 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
541 (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
542 Requires<[doMulWide]>;
543 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
544 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
545 Requires<[doMulWide]>;
546 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
547 (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
548 Requires<[doMulWide]>;
// 32-bit operands, 64-bit result.
550 def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
551 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
552 Requires<[doMulWide]>;
553 def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
554 (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
555 Requires<[doMulWide]>;
556 def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
557 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
558 Requires<[doMulWide]>;
559 def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
560 (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
561 Requires<[doMulWide]>;
563 // Predicates used for converting some patterns to mul.wide.
564 def SInt32Const : PatLeaf<(imm), [{
565 const APInt &v = N->getAPIntValue();
566 return v.isSignedIntN(32);
569 def UInt32Const : PatLeaf<(imm), [{
570 const APInt &v = N->getAPIntValue();
574 def SInt16Const : PatLeaf<(imm), [{
575 const APInt &v = N->getAPIntValue();
576 return v.isSignedIntN(16);
579 def UInt16Const : PatLeaf<(imm), [{
580 const APInt &v = N->getAPIntValue();
584 def Int5Const : PatLeaf<(imm), [{
585 // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
586 const APInt &v = N->getAPIntValue();
587 return v.sge(0) && v.slt(32);
590 def Int4Const : PatLeaf<(imm), [{
591 // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
592 const APInt &v = N->getAPIntValue();
593 return v.sge(0) && v.slt(16);
596 def SHL2MUL32 : SDNodeXForm<imm, [{
597 const APInt &v = N->getAPIntValue();
599 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
602 def SHL2MUL16 : SDNodeXForm<imm, [{
603 const APInt &v = N->getAPIntValue();
605 return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
608 // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
//
// The shift amount must satisfy Int5Const (32-bit source) or Int4Const
// (16-bit source). It is not passed through unchanged: SHL2MUL32 / SHL2MUL16
// rewrite it into the immediate that is handed to the *Imm instruction as the
// multiplier. sext selects the signed (MULWIDES*) form and zext the unsigned
// (MULWIDEU*) form. All four patterns are gated on doMulWide.
609 def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
610 (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
611 Requires<[doMulWide]>;
612 def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
613 (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
614 Requires<[doMulWide]>;
// 16-bit source, 32-bit result.
616 def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
617 (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
618 Requires<[doMulWide]>;
619 def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
620 (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
621 Requires<[doMulWide]>;
623 // Convert "sign/zero-extend then multiply" to mul.wide.
//
// Both operands must carry the same kind of extension (sext/sext or
// zext/zext), or the second operand must be a constant accepted by the
// matching range leaf (SInt32Const, UInt32Const, SInt16Const, UInt16Const).
// The constant forms select the *Imm64 / *Imm32 instructions, whose immediate
// operand has the width of the result rather than of the register source.
// All patterns are gated on doMulWide.
624 def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
625 (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
626 Requires<[doMulWide]>;
627 def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
628 (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
629 Requires<[doMulWide]>;
// Unsigned 32 x 32 -> 64.
631 def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
632 (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
633 Requires<[doMulWide]>;
634 def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
635 (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
636 Requires<[doMulWide]>;
// Signed 16 x 16 -> 32.
638 def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
639 (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
640 Requires<[doMulWide]>;
641 def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
642 (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
643 Requires<[doMulWide]>;
// Unsigned 16 x 16 -> 32.
645 def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
646 (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
647 Requires<[doMulWide]>;
648 def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
649 (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
650 Requires<[doMulWide]>;
653 // Integer multiply-add
656 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
657 SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
658 def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
661 NVPTXInst<(outs Int16Regs:$dst),
662 (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
663 "mad.lo.s16 \t$dst, $a, $b, $c;",
664 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
666 NVPTXInst<(outs Int16Regs:$dst),
667 (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
668 "mad.lo.s16 \t$dst, $a, $b, $c;",
669 [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
671 NVPTXInst<(outs Int16Regs:$dst),
672 (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
673 "mad.lo.s16 \t$dst, $a, $b, $c;",
674 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
676 NVPTXInst<(outs Int16Regs:$dst),
677 (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
678 "mad.lo.s16 \t$dst, $a, $b, $c;",
679 [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
682 NVPTXInst<(outs Int32Regs:$dst),
683 (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
684 "mad.lo.s32 \t$dst, $a, $b, $c;",
685 [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
687 NVPTXInst<(outs Int32Regs:$dst),
688 (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
689 "mad.lo.s32 \t$dst, $a, $b, $c;",
690 [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
692 NVPTXInst<(outs Int32Regs:$dst),
693 (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
694 "mad.lo.s32 \t$dst, $a, $b, $c;",
695 [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
697 NVPTXInst<(outs Int32Regs:$dst),
698 (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
699 "mad.lo.s32 \t$dst, $a, $b, $c;",
700 [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
703 NVPTXInst<(outs Int64Regs:$dst),
704 (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
705 "mad.lo.s64 \t$dst, $a, $b, $c;",
706 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
708 NVPTXInst<(outs Int64Regs:$dst),
709 (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
710 "mad.lo.s64 \t$dst, $a, $b, $c;",
711 [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
713 NVPTXInst<(outs Int64Regs:$dst),
714 (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
715 "mad.lo.s64 \t$dst, $a, $b, $c;",
716 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
718 NVPTXInst<(outs Int64Regs:$dst),
719 (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
720 "mad.lo.s64 \t$dst, $a, $b, $c;",
721 [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
724 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
725 "neg.s16 \t$dst, $src;",
726 [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
728 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
729 "neg.s32 \t$dst, $src;",
730 [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
732 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
733 "neg.s64 \t$dst, $src;",
734 [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
736 //-----------------------------------
737 // Floating Point Arithmetic
738 //-----------------------------------
741 def FloatConst1 : PatLeaf<(fpimm), [{
742 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
743 N->getValueAPF().convertToFloat() == 1.0f;
745 // Constant 1.0 (double)
746 def DoubleConst1 : PatLeaf<(fpimm), [{
747 return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
748 N->getValueAPF().convertToDouble() == 1.0;
// add, sub and mul are instantiated through F3_fma_component, which provides
// both the plain forms (gated on allowFMA) and the ".rn" forms.
751 defm FADD : F3_fma_component<"add", fadd>;
752 defm FSUB : F3_fma_component<"sub", fsub>;
753 defm FMUL : F3_fma_component<"mul", fmul>;
// min and max go through the plain F3 template.
755 defm FMIN : F3<"min", fminnum>;
756 defm FMAX : F3<"max", fmaxnum>;
// Single-input operations use F2. F2 appends the type suffix to its opcode
// string, so FSQRT prints "sqrt.rn.f64", "sqrt.rn.ftz.f32" and "sqrt.rn.f32".
758 defm FABS : F2<"abs", fabs>;
759 defm FNEG : F2<"neg", fneg>;
760 defm FSQRT : F2<"sqrt.rn", fsqrt>;
766 NVPTXInst<(outs Float64Regs:$dst),
767 (ins f64imm:$a, Float64Regs:$b),
768 "rcp.rn.f64 \t$dst, $b;",
769 [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
771 NVPTXInst<(outs Float64Regs:$dst),
772 (ins Float64Regs:$a, Float64Regs:$b),
773 "div.rn.f64 \t$dst, $a, $b;",
774 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
776 NVPTXInst<(outs Float64Regs:$dst),
777 (ins Float64Regs:$a, f64imm:$b),
778 "div.rn.f64 \t$dst, $a, $b;",
779 [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
782 // F32 Approximate reciprocal
785 NVPTXInst<(outs Float32Regs:$dst),
786 (ins f32imm:$a, Float32Regs:$b),
787 "rcp.approx.ftz.f32 \t$dst, $b;",
788 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
789 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
791 NVPTXInst<(outs Float32Regs:$dst),
792 (ins f32imm:$a, Float32Regs:$b),
793 "rcp.approx.f32 \t$dst, $b;",
794 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
795 Requires<[do_DIVF32_APPROX]>;
797 // F32 Approximate division
799 def FDIV32approxrr_ftz :
800 NVPTXInst<(outs Float32Regs:$dst),
801 (ins Float32Regs:$a, Float32Regs:$b),
802 "div.approx.ftz.f32 \t$dst, $a, $b;",
803 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
804 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
805 def FDIV32approxri_ftz :
806 NVPTXInst<(outs Float32Regs:$dst),
807 (ins Float32Regs:$a, f32imm:$b),
808 "div.approx.ftz.f32 \t$dst, $a, $b;",
809 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
810 Requires<[do_DIVF32_APPROX, doF32FTZ]>;
812 NVPTXInst<(outs Float32Regs:$dst),
813 (ins Float32Regs:$a, Float32Regs:$b),
814 "div.approx.f32 \t$dst, $a, $b;",
815 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
816 Requires<[do_DIVF32_APPROX]>;
818 NVPTXInst<(outs Float32Regs:$dst),
819 (ins Float32Regs:$a, f32imm:$b),
820 "div.approx.f32 \t$dst, $a, $b;",
821 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
822 Requires<[do_DIVF32_APPROX]>;
824 // F32 Semi-accurate reciprocal
826 // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
828 def FDIV321r_approx_ftz :
829 NVPTXInst<(outs Float32Regs:$dst),
830 (ins f32imm:$a, Float32Regs:$b),
831 "rcp.approx.ftz.f32 \t$dst, $b;",
832 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
833 Requires<[do_DIVF32_FULL, doF32FTZ]>;
834 def FDIV321r_approx :
835 NVPTXInst<(outs Float32Regs:$dst),
836 (ins f32imm:$a, Float32Regs:$b),
837 "rcp.approx.f32 \t$dst, $b;",
838 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
839 Requires<[do_DIVF32_FULL]>;
841 // F32 Semi-accurate division
844 NVPTXInst<(outs Float32Regs:$dst),
845 (ins Float32Regs:$a, Float32Regs:$b),
846 "div.full.ftz.f32 \t$dst, $a, $b;",
847 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
848 Requires<[do_DIVF32_FULL, doF32FTZ]>;
850 NVPTXInst<(outs Float32Regs:$dst),
851 (ins Float32Regs:$a, f32imm:$b),
852 "div.full.ftz.f32 \t$dst, $a, $b;",
853 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
854 Requires<[do_DIVF32_FULL, doF32FTZ]>;
856 NVPTXInst<(outs Float32Regs:$dst),
857 (ins Float32Regs:$a, Float32Regs:$b),
858 "div.full.f32 \t$dst, $a, $b;",
859 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
860 Requires<[do_DIVF32_FULL]>;
862 NVPTXInst<(outs Float32Regs:$dst),
863 (ins Float32Regs:$a, f32imm:$b),
864 "div.full.f32 \t$dst, $a, $b;",
865 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
866 Requires<[do_DIVF32_FULL]>;
868 // F32 Accurate reciprocal
// Round-to-nearest reciprocal with flush-to-zero. Matches an fdiv whose
// numerator is the FloatConst1 leaf and emits rcp.rn.ftz.f32 on $b; $a is
// bound by the pattern but never printed. Requires reqPTX20 and doF32FTZ.
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
  Requires<[reqPTX20, doF32FTZ]>;
877 NVPTXInst<(outs Float32Regs:$dst),
878 (ins f32imm:$a, Float32Regs:$b),
879 "rcp.rn.f32 \t$dst, $b;",
880 [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
881 Requires<[reqPTX20]>;
883 // F32 Accurate division
// Register / register f32 division emitted as div.rn.ftz.f32.
// Requires doF32FTZ and reqPTX20.
def FDIV32rr_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
  Requires<[doF32FTZ, reqPTX20]>;
// Register / immediate f32 division emitted as div.rn.ftz.f32; the divisor
// is an fpimm carried in an f32imm operand. Requires doF32FTZ and reqPTX20.
def FDIV32ri_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
  Requires<[doF32FTZ, reqPTX20]>;
898 NVPTXInst<(outs Float32Regs:$dst),
899 (ins Float32Regs:$a, Float32Regs:$b),
900 "div.rn.f32 \t$dst, $a, $b;",
901 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
902 Requires<[reqPTX20]>;
904 NVPTXInst<(outs Float32Regs:$dst),
905 (ins Float32Regs:$a, f32imm:$b),
906 "div.rn.f32 \t$dst, $a, $b;",
907 [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
908 Requires<[reqPTX20]>;
// rsqrt.approx.f32 of $b. The pattern list is empty, so this instruction is
// never selected directly from its own definition; it has to be named
// explicitly in a Pat<> result.
def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
                                 "rsqrt.approx.f32 \t$dst, $b;", []>;
// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
// it's emulated in software.)
// Applies only when do_DIVF32_FULL, do_SQRTF32_APPROX and doNoF32FTZ all
// hold; the instruction it selects carries no .ftz modifier.
def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
         (RSQRTF32approx1r Float32Regs:$b)>,
     Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
923 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
924 def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
925 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
926 [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
928 def rri : NVPTXInst<(outs RC:$dst),
929 (ins RC:$a, RC:$b, ImmCls:$c),
930 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
931 [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
933 def rir : NVPTXInst<(outs RC:$dst),
934 (ins RC:$a, ImmCls:$b, RC:$c),
935 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
936 [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
938 def rii : NVPTXInst<(outs RC:$dst),
939 (ins RC:$a, ImmCls:$b, ImmCls:$c),
940 !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
941 [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
// Instantiate the FMA multiclass for f32 (with and without .ftz) and for f64.
// Arguments are: PTX mnemonic, register class, immediate operand class, and
// the predicate bound to FMA's Pred parameter.
// NOTE(review): `true` is presumably an always-satisfied predicate defined
// elsewhere in this file -- confirm.
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
// f32 sine: selects fsin on a Float32Regs value to sin.approx.f32.
// No Requires<> clause is attached, so nothing here restricts when the
// approximate instruction is chosen.
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "sin.approx.f32 \t$dst, $src;",
                    [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
// f32 cosine: selects fcos on a Float32Regs value to cos.approx.f32.
// No Requires<> clause is attached, so nothing here restricts when the
// approximate instruction is chosen.
def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "cos.approx.f32 \t$dst, $src;",
                    [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
957 // Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
958 // i.e. "poor man's fmod()"
961 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
962 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
963 (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
965 Requires<[doF32FTZ]>;
966 def : Pat<(frem Float32Regs:$x, fpimm:$y),
967 (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
968 (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
970 Requires<[doF32FTZ]>;
973 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
974 (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
975 (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
977 def : Pat<(frem Float32Regs:$x, fpimm:$y),
978 (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
979 (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
983 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
984 (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
985 (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
987 def : Pat<(frem Float64Regs:$x, fpimm:$y),
988 (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
989 (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
992 //-----------------------------------
993 // Bitwise operations
994 //-----------------------------------
// Template for three-arg bitwise operations. Takes three args and creates
// .b16, .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of
// OpcStr.
998 multiclass BITWISE<string OpcStr, SDNode OpNode> {
1000 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1001 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1002 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
1004 NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1005 !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1006 [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
1008 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1009 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1010 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
1012 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1013 !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1014 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
1016 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1017 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1018 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
1020 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1021 !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1022 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
1024 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1025 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1026 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
1028 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1029 !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1030 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
// Instantiate BITWISE once per logical SDNode. The string is the PTX mnemonic
// stem; BITWISE appends the .pred / .b16 / .b32 / .b64 type suffix.
defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
// Bitwise complement, one definition per register class. Each selects the
// `not` fragment on its register class and prints the matching PTX form:
// not.pred for predicates, not.b16 / not.b32 / not.b64 for integers.
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                     "not.pred \t$dst, $src;",
                     [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                      "not.b16 \t$dst, $src;",
                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                      "not.b32 \t$dst, $src;",
                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                      "not.b64 \t$dst, $src;",
                      [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1050 // Template for left/right shifts. Takes three operands,
1051 // [dest (reg), src (reg), shift (reg or imm)].
1052 // dest and src may be int64, int32, or int16, but shift is always int32.
1054 // This template also defines a 32-bit shift (imm, imm) instruction.
1055 multiclass SHIFT<string OpcStr, SDNode OpNode> {
1057 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1058 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1059 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
1061 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1062 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1063 [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
1065 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1066 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1067 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
1069 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1070 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1071 [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
1073 NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1074 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1075 [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1077 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1078 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1079 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
1081 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1082 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1083 [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
// Instantiate SHIFT for left shift, arithmetic right shift and logical right
// shift. The string is the mnemonic up to the type letter; SHIFT appends the
// operand width (64, 32 or 16).
defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;
1091 // Rotate: Use ptx shf instruction if available.
1094 // 32 bit r2 = rotl r1, n
1096 // r2 = shf.l r1, r1, n
1098 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1099 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1100 [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
1101 Requires<[hasHWROT32]>;
1104 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1105 "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1106 [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1107 Requires<[hasHWROT32]>;
1109 // 32 bit r2 = rotr r1, n
1111 // r2 = shf.r r1, r1, n
1113 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1114 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1115 [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
1116 Requires<[hasHWROT32]>;
1119 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1120 "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1121 [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1122 Requires<[hasHWROT32]>;
1124 // 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
1126 NVPTXInst<(outs Int32Regs:$dst),
1127 (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
1129 ".reg .b32 %lhs;\n\t"
1130 ".reg .b32 %rhs;\n\t"
1131 "shl.b32 \t%lhs, $src, $amt1;\n\t"
1132 "shr.b32 \t%rhs, $src, $amt2;\n\t"
1133 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1137 def SUB_FRM_32 : SDNodeXForm<imm, [{
1138 return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
// 32-bit rotate by immediate when noHWROT32 holds. Both directions use
// ROT32imm_sw, which takes two shift amounts. SUB_FRM_32 rewrites the
// immediate to 32 - amt, so rotl passes (amt, 32 - amt) and rotr passes
// (32 - amt, amt).
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
      Requires<[noHWROT32]>;
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
      Requires<[noHWROT32]>;
1148 // 32-bit software rotate left by register.
1150 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1152 ".reg .b32 %lhs;\n\t"
1153 ".reg .b32 %rhs;\n\t"
1154 ".reg .b32 %amt2;\n\t"
1155 "shl.b32 \t%lhs, $src, $amt;\n\t"
1156 "sub.s32 \t%amt2, 32, $amt;\n\t"
1157 "shr.b32 \t%rhs, $src, %amt2;\n\t"
1158 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1160 [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1161 Requires<[noHWROT32]>;
1163 // 32-bit software rotate right by register.
1165 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1167 ".reg .b32 %lhs;\n\t"
1168 ".reg .b32 %rhs;\n\t"
1169 ".reg .b32 %amt2;\n\t"
1170 "shr.b32 \t%lhs, $src, $amt;\n\t"
1171 "sub.s32 \t%amt2, 32, $amt;\n\t"
1172 "shl.b32 \t%rhs, $src, %amt2;\n\t"
1173 "add.u32 \t$dst, %lhs, %rhs;\n\t"
1175 [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1176 Requires<[noHWROT32]>;
1178 // 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
1180 NVPTXInst<(outs Int64Regs:$dst),
1181 (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
1183 ".reg .b64 %lhs;\n\t"
1184 ".reg .b64 %rhs;\n\t"
1185 "shl.b64 \t%lhs, $src, $amt1;\n\t"
1186 "shr.b64 \t%rhs, $src, $amt2;\n\t"
1187 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1191 def SUB_FRM_64 : SDNodeXForm<imm, [{
1192 return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
// 64-bit rotate by immediate. Both directions use ROT64imm_sw, which takes
// two shift amounts. SUB_FRM_64 rewrites the immediate to 64 - amt, so rotl
// passes (amt, 64 - amt) and rotr passes (64 - amt, amt). No Requires<>
// clause is attached to either pattern.
def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1200 // 64-bit software rotate left by register.
1202 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1204 ".reg .b64 %lhs;\n\t"
1205 ".reg .b64 %rhs;\n\t"
1206 ".reg .u32 %amt2;\n\t"
1207 "shl.b64 \t%lhs, $src, $amt;\n\t"
1208 "sub.u32 \t%amt2, 64, $amt;\n\t"
1209 "shr.b64 \t%rhs, $src, %amt2;\n\t"
1210 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1212 [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
1215 NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1217 ".reg .b64 %lhs;\n\t"
1218 ".reg .b64 %rhs;\n\t"
1219 ".reg .u32 %amt2;\n\t"
1220 "shr.b64 \t%lhs, $src, $amt;\n\t"
1221 "sub.u32 \t%amt2, 64, $amt;\n\t"
1222 "shl.b64 \t%rhs, $src, %amt2;\n\t"
1223 "add.u64 \t$dst, %lhs, %rhs;\n\t"
1225 [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
// Funnel shift in clamp mode
1231 // Create SDNodes so they can be used in the DAG code, e.g.
1232 // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// Type profile shared by both funnel-shift nodes: one result and three
// operands. The result and the first two operands have the same integer
// type; the third operand (the shift amount) is an integer.
def SDTIntShiftDOp :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                       SDTCisInt<0>, SDTCisInt<3>]>;
// Target-specific DAG nodes for the left and right clamped funnel shifts.
// Neither declares any SDNode properties (empty property list).
def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1240 NVPTXInst<(outs Int32Regs:$dst),
1241 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1242 "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
1243 [(set Int32Regs:$dst,
1244 (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1247 NVPTXInst<(outs Int32Regs:$dst),
1248 (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1249 "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
1250 [(set Int32Regs:$dst,
1251 (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1254 // BFE - bit-field extract
1257 // Template for BFE instructions. Takes four args,
1258 // [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
1259 // Start may be an imm only if end is also an imm. FIXME: Is this a
1260 // restriction in PTX?
1262 // dest and src may be int32 or int64, but start and end are always int32.
1263 multiclass BFE<string TyStr, RegisterClass RC> {
1265 : NVPTXInst<(outs RC:$d),
1266 (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1267 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1269 : NVPTXInst<(outs RC:$d),
1270 (ins RC:$a, Int32Regs:$b, i32imm:$c),
1271 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1273 : NVPTXInst<(outs RC:$d),
1274 (ins RC:$a, i32imm:$b, i32imm:$c),
1275 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1278 let hasSideEffects = 0 in {
1279 defm BFE_S32 : BFE<"s32", Int32Regs>;
1280 defm BFE_U32 : BFE<"u32", Int32Regs>;
1281 defm BFE_S64 : BFE<"s64", Int64Regs>;
1282 defm BFE_U64 : BFE<"u64", Int64Regs>;
1285 //-----------------------------------
1286 // Comparison instructions (setp, set)
1287 //-----------------------------------
1289 // FIXME: This doesn't cover versions of set and setp that combine with a
1290 // boolean predicate, e.g. setp.eq.and.b16.
1292 let hasSideEffects = 0 in {
1293 multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1295 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1296 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1297 "\t$dst, $a, $b;"), []>;
1299 NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1300 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1301 "\t$dst, $a, $b;"), []>;
1303 NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1304 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1305 "\t$dst, $a, $b;"), []>;
// One SETP instantiation per PTX comparison type. Arguments follow SETP's
// parameter list: type suffix printed after "setp<cmp>.", register class of
// the compared values, and the operand class used for immediate forms.
defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
1321 // FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
1322 // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1323 // reg, either u32, s32, or f32. Anyway these aren't used at the moment.
1325 let hasSideEffects = 0 in {
1326 multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1327 def rr : NVPTXInst<(outs Int32Regs:$dst),
1328 (ins RC:$a, RC:$b, CmpMode:$cmp),
1329 !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
1330 def ri : NVPTXInst<(outs Int32Regs:$dst),
1331 (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1332 !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
1333 def ir : NVPTXInst<(outs Int32Regs:$dst),
1334 (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1335 !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
// One SET instantiation per PTX comparison type. Arguments follow SET's
// parameter list: type suffix printed after "set<cmp>.", register class of
// the compared values, and the operand class used for immediate forms.
// Every SET variant writes its result to an Int32Regs destination.
defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1351 //-----------------------------------
1352 // Selection instructions (selp)
1353 //-----------------------------------
1355 // FIXME: Missing slct
1357 // selp instructions that don't have any pattern matches; we explicitly use
1358 // them within this file.
1359 let hasSideEffects = 0 in {
1360 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1361 def rr : NVPTXInst<(outs RC:$dst),
1362 (ins RC:$a, RC:$b, Int1Regs:$p),
1363 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1364 def ri : NVPTXInst<(outs RC:$dst),
1365 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1366 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1367 def ir : NVPTXInst<(outs RC:$dst),
1368 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1369 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1370 def ii : NVPTXInst<(outs RC:$dst),
1371 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1372 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1375 multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
1378 NVPTXInst<(outs RC:$dst),
1379 (ins RC:$a, RC:$b, Int1Regs:$p),
1380 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1381 [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
1383 NVPTXInst<(outs RC:$dst),
1384 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1385 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1386 [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
1388 NVPTXInst<(outs RC:$dst),
1389 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1390 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1391 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
1393 NVPTXInst<(outs RC:$dst),
1394 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1395 !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1396 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
1400 // Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;  // matches `select`
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;  // no pattern; used explicitly
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;  // no pattern; used explicitly
defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;  // matches `select`
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;  // no pattern; used explicitly
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;  // no pattern; used explicitly
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;  // matches `select`
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;  // no pattern; used explicitly
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;  // no pattern; used explicitly
// Floating-point selects use fpimm as the immediate node.
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
1414 //-----------------------------------
1415 // Data Movement (Load / Store, Move)
1416 //-----------------------------------
1418 def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
1420 def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
1423 def MEMri : Operand<i32> {
1424 let PrintMethod = "printMemOperand";
1425 let MIOperandInfo = (ops Int32Regs, i32imm);
1427 def MEMri64 : Operand<i64> {
1428 let PrintMethod = "printMemOperand";
1429 let MIOperandInfo = (ops Int64Regs, i64imm);
1432 def imem : Operand<iPTR> {
1433 let PrintMethod = "printOperand";
1436 def imemAny : Operand<iPTRAny> {
1437 let PrintMethod = "printOperand";
1440 def LdStCode : Operand<i32> {
1441 let PrintMethod = "printLdStCode";
// Type profile for Wrapper: one result and one operand of the same type,
// and the result is constrained to the pointer type.
def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
// DAG node for NVPTXISD::Wrapper; the move patterns in this file match it
// around tglobaladdr and texternalsym operands.
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
// Load a memory address into a u32 or u64 register.
// 32-bit form: selects a Wrapper around a target global address and prints
// it as a mov.u32 of the symbol operand.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
                         "mov.u32 \t$dst, $a;",
                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
// 64-bit form: selects a Wrapper around a target global address into an
// Int64Regs destination and prints it as a mov.u64 of the symbol operand.
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                           "mov.u64 \t$dst, $a;",
                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1455 // Get pointer to local stack.
1456 let hasSideEffects = 0 in {
1457 def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
1458 "mov.u32 \t$d, __local_depot$num;", []>;
1459 def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
1460 "mov.u64 \t$d, __local_depot$num;", []>;
1464 // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
1465 let IsSimpleMove=1, hasSideEffects=0 in {
1466 def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
1467 "mov.pred \t$dst, $sss;", []>;
1468 def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
1469 "mov.u16 \t$dst, $sss;", []>;
1470 def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
1471 "mov.u32 \t$dst, $sss;", []>;
1472 def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
1473 "mov.u64 \t$dst, $sss;", []>;
1475 def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1476 "mov.f32 \t$dst, $src;", []>;
1477 def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
1478 "mov.f64 \t$dst, $src;", []>;
// Integer immediate moves. Each selects a bare `imm` of the destination
// register class and prints the corresponding mov (mov.pred for predicates,
// mov.u16 / mov.u32 / mov.u64 for integers).
// Note that the 64-bit definition is named IMOV64i, not IMOV64ri.
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                        "mov.pred \t$dst, $src;",
                        [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                         "mov.u16 \t$dst, $src;",
                         [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                         "mov.u32 \t$dst, $src;",
                         [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                        "mov.u64 \t$dst, $src;",
                        [(set Int64Regs:$dst, imm:$src)]>;
// Floating-point immediate moves: select a bare `fpimm` into a Float32Regs
// or Float64Regs destination, printed as mov.f32 / mov.f64.
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
                         "mov.f32 \t$dst, $src;",
                         [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
                         "mov.f64 \t$dst, $src;",
                         [(set Float64Regs:$dst, fpimm:$src)]>;
// An i32 Wrapper around a target external symbol is selected to IMOV32ri
// with the symbol itself as the "immediate" operand.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
//---- Copy Frame Index ----
// Materialize an address matched by the ADDRri / ADDRri64 complex patterns
// into a register. The memory operand is printed with the `add` modifier
// (${addr:add}) to supply the remaining operands of add.u32 / add.u64.
def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
                          "add.u32 \t$dst, ${addr:add};",
                          [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
                            "add.u64 \t$dst, ${addr:add};",
                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1511 //-----------------------------------
1512 // Comparison and Selection
1513 //-----------------------------------
1515 multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
1516 Instruction setp_16rr,
1517 Instruction setp_16ri,
1518 Instruction setp_16ir,
1519 Instruction setp_32rr,
1520 Instruction setp_32ri,
1521 Instruction setp_32ir,
1522 Instruction setp_64rr,
1523 Instruction setp_64ri,
1524 Instruction setp_64ir,
1525 Instruction set_16rr,
1526 Instruction set_16ri,
1527 Instruction set_16ir,
1528 Instruction set_32rr,
1529 Instruction set_32ri,
1530 Instruction set_32ir,
1531 Instruction set_64rr,
1532 Instruction set_64ri,
1533 Instruction set_64ir> {
1535 def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
1536 (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
1537 def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
1538 (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
1539 def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
1540 (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
1542 def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
1543 (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
1544 def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
1545 (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
1546 def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
1547 (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
1549 def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
1550 (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
1551 def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
1552 (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
1553 def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
1554 (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
1557 def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
1558 (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
1559 def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
1560 (set_16ri Int16Regs:$a, imm:$b, Mode)>;
1561 def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
1562 (set_16ir imm:$a, Int16Regs:$b, Mode)>;
1564 def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
1565 (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
1566 def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
1567 (set_32ri Int32Regs:$a, imm:$b, Mode)>;
1568 def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
1569 (set_32ir imm:$a, Int32Regs:$b, Mode)>;
1571 def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
1572 (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
1573 def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
1574 (set_64ri Int64Regs:$a, imm:$b, Mode)>;
1575 def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
1576 (set_64ir imm:$a, Int64Regs:$b, Mode)>;
1579 multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
1580 : ISET_FORMAT<OpNode, Mode,
1581 SETP_s16rr, SETP_s16ri, SETP_s16ir,
1582 SETP_s32rr, SETP_s32ri, SETP_s32ir,
1583 SETP_s64rr, SETP_s64ri, SETP_s64ir,
1584 SET_s16rr, SET_s16ri, SET_s16ir,
1585 SET_s32rr, SET_s32ri, SET_s32ir,
1586 SET_s64rr, SET_s64ri, SET_s64ir> {
1587 // TableGen doesn't like empty multiclasses.
1588 def : PatLeaf<(i32 0)>;
1591 multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
1592 : ISET_FORMAT<OpNode, Mode,
1593 SETP_u16rr, SETP_u16ri, SETP_u16ir,
1594 SETP_u32rr, SETP_u32ri, SETP_u32ir,
1595 SETP_u64rr, SETP_u64ri, SETP_u64ir,
1596 SET_u16rr, SET_u16ri, SET_u16ir,
1597 SET_u32rr, SET_u32ri, SET_u32ir,
1598 SET_u64rr, SET_u64ri, SET_u64ir> {
1599 // TableGen doesn't like empty multiclasses.
1600 def : PatLeaf<(i32 0)>;
// Integer comparison patterns. ISET_FORMAT_SIGNED binds the condition code to
// the SETP_s* / SET_s* instructions and ISET_FORMAT_UNSIGNED to the
// SETP_u* / SET_u* instructions. The second argument is the comparison-mode
// leaf passed as the instruction's $cmp operand.
// seteq / setne are instantiated through the signed wrapper; setueq / setune
// through the unsigned one, with the same CmpEQ / CmpNE modes.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
// Comparing two predicate registers needs no setp: "not equal" is the XOR of
// the two predicates. The plain and unsigned condition codes select the same
// instruction.
def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
// "Equal" is the complement of that XOR.
def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
// i1 compare -> i32
// Compare two predicate registers and produce an i32. The predicates are
// XORed (set exactly when $a != $b) and the result drives a selp between the
// constants -1 and 0. selp yields its first value operand when the predicate
// is set.
//
// setne: $a != $b gives -1, otherwise 0.
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
// seteq: the selp operands are swapped, so $a != $b gives 0, otherwise -1.
// This pattern previously matched setne as well, which duplicated the source
// pattern above and left (i32 (seteq i1, i1)) without a pattern.
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1635 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
1637 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
1638 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
1639 Requires<[doF32FTZ]>;
1640 def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
1641 (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
1642 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
1643 (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
1644 Requires<[doF32FTZ]>;
1645 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
1646 (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
1647 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
1648 (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
1649 Requires<[doF32FTZ]>;
1650 def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
1651 (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
1654 def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
1655 (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
1656 def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
1657 (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
1658 def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
1659 (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
1662 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
1663 (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
1664 Requires<[doF32FTZ]>;
1665 def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
1666 (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
1667 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
1668 (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
1669 Requires<[doF32FTZ]>;
1670 def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
1671 (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
1672 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
1673 (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
1674 Requires<[doF32FTZ]>;
1675 def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
1676 (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
1679 def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
1680 (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
1681 def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
1682 (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
1683 def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
1684 (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
// Floating-point comparison patterns. Each line binds one condition code to a
// pair of comparison modes: FSET_FORMAT uses the second (plain) mode by
// default and the third (_FTZ) mode for the f32 patterns guarded by doF32FTZ.

// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
// Unordered comparisons use the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
// Condition codes with no ordered/unordered prefix map to the same modes as
// the ordered ones.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
// seto / setuo map to the CmpNUM / CmpNAN modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
1711 // FIXME: What is this doing here? Can it be deleted?
1712 // def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
1713 // [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
// SelectionDAG type profiles for the parameter-passing, call and return-value
// nodes. Each is SDTypeProfile<NumResults, NumOperands, Constraints>;
// constraint indices number the results first, then the operands.

// Parameter / return declarations: no results, three integer operands.
def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
// Parameter loads: 1, 2 or 4 results followed by two integer operands. Only
// the V2 profile additionally requires its two results to have the same type.
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
// Parameter stores: the first two operands are integers; the remaining
// operands are left unconstrained.
def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
def SDTCallValProfile : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
// Return-value stores: the first operand is an integer; the remaining
// operands are left unconstrained.
def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
// SelectionDAG nodes used for parameter declaration, parameter load/store,
// call printing and return-value handling. Each SDNode pairs an NVPTXISD
// opcode name with an SDT*Profile type profile and a list of node properties.
// NOTE(review): several entries in this listing start directly at `SDNode<...>`
// with no preceding `def <Name> :` line (the first entry below is one) —
// confirm against the upstream file before editing these records.
1739 SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
1740 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1741 def DeclareScalarParam :
1742 SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
1743 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1744 def DeclareRetParam :
1745 SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
1746 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1748 SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
1749 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// The LoadParam* nodes are flagged SDNPMayLoad rather than SDNPSideEffect.
1751 SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
1752 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
1754 SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
1755 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
1757 SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
1758 [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
1760 SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
1761 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1762 def PrintConvergentCall :
1763 SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
1764 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1766 SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
1767 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1768 def PrintConvergentCallUni :
1769 SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
1770 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1772 SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
1773 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1775 SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
1776 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1778 SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
1779 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// StoreParamU32 and StoreParamS32 share SDTStoreParam32Profile.
1781 SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
1782 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1784 SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
1785 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1787 SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
1788 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1790 SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
1791 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1793 SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
1794 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// CallArgEnd, CallVoid and Prototype all use SDTCallVoidProfile (one operand).
1796 SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
1797 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1799 SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
1800 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1802 SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
1803 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
1805 SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
1806 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam has an empty property list: no chain, glue or side-effect flags.
1808 SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
// The StoreRetval* nodes are chained and have side effects but carry no glue
// flags.
1810 SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
1811 [SDNPHasChain, SDNPSideEffect]>;
1813 SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
1814 [SDNPHasChain, SDNPSideEffect]>;
1816 SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
1817 [SDNPHasChain, SDNPSideEffect]>;
1818 def PseudoUseParam :
1819 SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
1820 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// RETURN reuses the zero-operand SDTCallArgMarkProfile and, like StoreRetval*,
// has no glue flags.
1822 SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
1823 [SDNPHasChain, SDNPSideEffect]>;
// Instruction classes that read back values named retval*. Template
// arguments: `regclass` is the destination register class and `opstr` is the
// type suffix appended to the mnemonic.
// NOTE(review): this listing is missing lines inside this group (pattern
// lists, part of the V4 operand list, and the closing brace of the `let`) —
// confirm against the upstream file.
1825 let mayLoad = 1 in {
// Scalar form: "ld.param<opstr> $dst, [retval0+$b]".
1826 class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
1827 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
1828 !strconcat(!strconcat("ld.param", opstr),
1829 "\t$dst, [retval0+$b];"),
// Two-element vector form; its pattern list is empty.
1832 class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
1833 NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
1834 !strconcat("ld.param.v2", opstr,
1835 "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
// Four-element vector form.
1837 class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
1838 NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
1841 !strconcat("ld.param.v4", opstr,
1842 "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
// Register form: emits "mov<opstr> $dst, retval$b" and is selected for
// LoadParam nodes whose first operand is the constant 0.
1846 class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
1847 NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
1848 !strconcat("mov", opstr, "\t$dst, retval$b;"),
1849 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
// Instruction classes that store outgoing call parameters (to param$a at
// offset $b) and function return values (to func_retval0 at offset $a).
// `regclass` is the source register class and `opstr` is the type suffix
// appended to the "st.param" mnemonic.
// NOTE(review): this listing is missing lines inside this group (pattern
// lists, part of some operand lists, and the closing brace of the `let`) —
// confirm against the upstream file.
1851 let mayStore = 1 in {
// Scalar parameter store: "st.param<opstr> [param$a+$b], $val".
1852 class StoreParamInst<NVPTXRegClass regclass, string opstr> :
1853 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
1854 !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
// Two-element vector parameter store.
1857 class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
1858 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
1859 i32imm:$a, i32imm:$b),
1860 !strconcat("st.param.v2", opstr,
1861 "\t[param$a+$b], {{$val, $val2}};"),
// Four-element vector parameter store.
1864 class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
1865 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
1866 regclass:$val4, i32imm:$a,
1868 !strconcat("st.param.v4", opstr,
1869 "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
// Scalar return-value store: "st.param<opstr> [func_retval0+$a], $val".
1872 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
1873 NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
1874 !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
// Two-element vector return-value store.
1877 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
1878 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
1879 !strconcat("st.param.v2", opstr,
1880 "\t[func_retval0+$a], {{$val, $val2}};"),
// Four-element vector return-value store.
1883 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
1885 (ins regclass:$val, regclass:$val2, regclass:$val3,
1886 regclass:$val4, i32imm:$a),
1887 !strconcat("st.param.v4", opstr,
1888 "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
// Defines the instructions that print the start of a call, one per
// return-value count from 0 to 8. Each instruction has no operands, emits
// OpcStr followed by a parenthesised list of retval0..retvalN-1 names (nothing
// for the zero-result form), and is selected for OpNode with that count as its
// i32 operand.
//   OpcStr - mnemonic text to print (e.g. "call").
//   OpNode - SelectionDAG node that the patterns match.
// NOTE(review): in this listing the asm string of PrintCallRetInst6 lacks its
// continuation line and the multiclass has no closing brace — confirm against
// the upstream file.
1893 multiclass CALL<string OpcStr, SDNode OpNode> {
1894 def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
1895 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
1896 def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
1897 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
1898 def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
1899 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
1900 def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
1901 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
1902 def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
1903 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
1904 [(OpNode (i32 4))]>;
1905 def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
1906 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
1907 [(OpNode (i32 5))]>;
1908 def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
1909 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
1911 [(OpNode (i32 6))]>;
1912 def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
1913 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
1914 "retval5, retval6), "),
1915 [(OpNode (i32 7))]>;
1916 def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
1917 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
1918 "retval5, retval6, retval7), "),
1919 [(OpNode (i32 8))]>;
// Regular call printers: "call" is matched from PrintCall and "call.uni" from
// PrintCallUni.
1923 defm Call : CALL<"call", PrintCall>;
1924 defm CallUni : CALL<"call.uni", PrintCallUni>;
1926 // Convergent call instructions. These are identical to regular calls, except
1927 // they have the isConvergent bit set.
// They print the same mnemonics but are matched from the PrintConvergentCall /
// PrintConvergentCallUni nodes.
1928 let isConvergent=1 in {
1929 defm ConvergentCall : CALL<"call", PrintConvergentCall>;
1930 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
// Concrete parameter-load instructions: one record per register class and
// type suffix. The I8 variants use Int16Regs with a ".b8" suffix. V4 forms are
// defined only for the 32-bit-and-narrower types.
1933 def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
1934 def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
1935 def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
1936 def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
1937 def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
1938 def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
1939 def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
1940 def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
1941 def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
1942 def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
1943 def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
1944 def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
1945 def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
1946 def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
1947 def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
1948 def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
// Concrete parameter-store instructions, following the same naming scheme.
1950 def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
1951 def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
1953 def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
1954 def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
1955 def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
1956 def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
1957 def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
1958 def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
1960 def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
1961 def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
1962 def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
1964 def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
1965 def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
1966 def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
1967 def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
1968 def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
// Concrete return-value store instructions.
1970 def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
1971 def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
1972 def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
1973 def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
1974 def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
1975 def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
1976 def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
1977 def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
1978 def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
1979 def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
1980 def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
1982 def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
1983 def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
1984 def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
1985 def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
1986 def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
// Pieces of the printed call argument list. CallArgBegin prints "(";
// CallArgEnd prints ");" when its operand is 1 and ")" when it is 0.
1988 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
1989 def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
1990 def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
1991 def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
// Prints a register argument followed by ", "; matches CallArg whose first
// operand is 0.
1993 class CallArgInst<NVPTXRegClass regclass> :
1994 NVPTXInst<(outs), (ins regclass:$a), "$a, ",
1995 [(CallArg (i32 0), regclass:$a)]>;
// Same as CallArgInst but for the final argument: no trailing separator.
1997 class LastCallArgInst<NVPTXRegClass regclass> :
1998 NVPTXInst<(outs), (ins regclass:$a), "$a",
1999 [(LastCallArg (i32 0), regclass:$a)]>;
2001 def CallArgI64 : CallArgInst<Int64Regs>;
2002 def CallArgI32 : CallArgInst<Int32Regs>;
2003 def CallArgI16 : CallArgInst<Int16Regs>;
2004 def CallArgF64 : CallArgInst<Float64Regs>;
2005 def CallArgF32 : CallArgInst<Float32Regs>;
2007 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
2008 def LastCallArgI32 : LastCallArgInst<Int32Regs>;
2009 def LastCallArgI16 : LastCallArgInst<Int16Regs>;
2010 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
2011 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
// i32 immediate arguments (first operand 0): the immediate itself is printed.
2013 def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
2014 [(CallArg (i32 0), (i32 imm:$a))]>;
2015 def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
2016 [(LastCallArg (i32 0), (i32 imm:$a))]>;
// First operand 1: the immediate is printed as the name "param<N>".
2018 def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
2019 [(CallArg (i32 1), (i32 imm:$a))]>;
2020 def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
2021 [(LastCallArg (i32 1), (i32 imm:$a))]>;
// Call target: a wrapped global address, or an address held in a 32-bit or
// 64-bit integer register. Each prints the target followed by ", ".
2023 def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
2024 [(CallVoid (Wrapper tglobaladdr:$addr))]>;
2025 def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
2026 [(CallVoid Int32Regs:$addr)]>;
2027 def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
2028 [(CallVoid Int64Regs:$addr)]>;
// Prints ", prototype_<val>;" for a Prototype node.
2029 def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
2030 [(Prototype (i32 imm:$val))]>;
// Declarations printed for return values. DeclareRetParam prints an aligned
// .b8 array named retval<num>.
2032 def DeclareRetMemInst :
2033 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
2034 ".param .align $align .b8 retval$num[$size];",
2035 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
// DeclareRet's first operand selects the printed form: 1 prints a scalar
// ".param" declaration, 2 prints a ".reg" declaration.
2036 def DeclareRetScalarInst :
2037 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2038 ".param .b$size retval$num;",
2039 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
2040 def DeclareRetRegInst :
2041 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2042 ".reg .b$size retval$num;",
2043 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
// Declarations printed for outgoing parameters. DeclareParam prints an aligned
// .b8 array named param<a>.
2045 def DeclareParamInst :
2046 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
2047 ".param .align $align .b8 param$a[$size];",
2048 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
// DeclareScalarParam's third operand selects the printed form: 0 prints a
// ".param" declaration, 1 prints a ".reg" declaration.
2049 def DeclareScalarParamInst :
2050 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2051 ".param .b$size param$a;",
2052 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
2053 def DeclareScalarRegInst :
2054 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2055 ".reg .b$size param$a;",
2056 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
// Selects a MoveParam node as a register-to-register "mov<asmstr>" within
// the same register class.
2058 class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
2059 NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2060 !strconcat("mov", asmstr, "\t$dst, $src;"),
2061 [(set regclass:$dst, (MoveParam regclass:$src))]>;
2063 def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
2064 def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
// The Int16Regs case is written out by hand and prints "cvt.u16.u32" instead
// of a mov.
// NOTE(review): the `def` header for this record is missing from this listing
// (presumably MoveParamI16) — confirm against the upstream file.
2066 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2067 "cvt.u16.u32\t$dst, $src;",
2068 [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
2069 def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
2070 def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
// Selects a PseudoUseParam node. The only text it prints is a "//" comment
// naming the register, so no real instruction is emitted for it.
2072 class PseudoUseParamInst<NVPTXRegClass regclass> :
2073 NVPTXInst<(outs), (ins regclass:$src),
2074 "// Pseudo use of $src",
2075 [(PseudoUseParam regclass:$src)]>;
// One instantiation per register class.
2077 def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
2078 def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
2079 def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
2080 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
2081 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
2085 // Load / Store Handling
// Scalar load instructions for one destination register class. Every variant
// takes the same leading operands ($isVol, $addsp, $Vec, $Sign, $fromWidth),
// which are printed as modifiers on the "ld" mnemonic, and differs only in how
// the address is given:
//   _avar            - imem address
//   _areg / _areg_64 - address in an Int32Regs / Int64Regs register
//   _ari / _ari_64   - Int32Regs / Int64Regs register plus i32imm offset
//   _asi             - imem address plus i32imm offset
// All pattern lists are empty, so these records are not matched by TableGen
// patterns.
// NOTE(review): the closing braces of this multiclass and of the `let` below
// are missing from this listing — confirm against the upstream file.
2087 multiclass LD<NVPTXRegClass regclass> {
2088 def _avar : NVPTXInst<
2089 (outs regclass:$dst),
2090 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2091 i32imm:$fromWidth, imem:$addr),
2092 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2093 "\t$dst, [$addr];", []>;
2094 def _areg : NVPTXInst<
2095 (outs regclass:$dst),
2096 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2097 i32imm:$fromWidth, Int32Regs:$addr),
2098 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2099 "\t$dst, [$addr];", []>;
2100 def _areg_64 : NVPTXInst<
2101 (outs regclass:$dst),
2102 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2103 i32imm:$fromWidth, Int64Regs:$addr),
2104 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2105 "\t$dst, [$addr];", []>;
2106 def _ari : NVPTXInst<
2107 (outs regclass:$dst),
2108 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2109 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2110 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2111 "\t$dst, [$addr+$offset];", []>;
2112 def _ari_64 : NVPTXInst<
2113 (outs regclass:$dst),
2114 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2115 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2116 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2117 "\t$dst, [$addr+$offset];", []>;
2118 def _asi : NVPTXInst<
2119 (outs regclass:$dst),
2120 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2121 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2122 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2123 "\t$dst, [$addr+$offset];", []>;
// Instantiate the loads per value type. LD_i8 and LD_i16 both use Int16Regs.
2126 let mayLoad=1, hasSideEffects=0 in {
2127 defm LD_i8 : LD<Int16Regs>;
2128 defm LD_i16 : LD<Int16Regs>;
2129 defm LD_i32 : LD<Int32Regs>;
2130 defm LD_i64 : LD<Int64Regs>;
2131 defm LD_f32 : LD<Float32Regs>;
2132 defm LD_f64 : LD<Float64Regs>;
// Scalar store instructions for one source register class. Every variant takes
// the value to store ($src) followed by the modifier operands ($isVol, $addsp,
// $Vec, $Sign, $toWidth), which are printed on the "st" mnemonic. The variants
// differ only in how the address is given:
//   _avar            - imem address
//   _areg / _areg_64 - address in an Int32Regs / Int64Regs register
//   _ari / _ari_64   - Int32Regs / Int64Regs register plus i32imm offset
//   _asi             - imem address plus i32imm offset
// All pattern lists are empty.
// NOTE(review): in this listing each record goes straight from `NVPTXInst<` to
// its `(ins ...)` list with no outs list in between, and the closing braces
// are missing — confirm against the upstream file.
2135 multiclass ST<NVPTXRegClass regclass> {
2136 def _avar : NVPTXInst<
2138 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2139 LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
2140 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2141 " \t[$addr], $src;", []>;
2142 def _areg : NVPTXInst<
2144 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
2145 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
2146 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2147 " \t[$addr], $src;", []>;
2148 def _areg_64 : NVPTXInst<
2150 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2151 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
2152 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2153 " \t[$addr], $src;", []>;
2154 def _ari : NVPTXInst<
2156 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2157 LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
2158 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2159 " \t[$addr+$offset], $src;", []>;
2160 def _ari_64 : NVPTXInst<
2162 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2163 LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
2164 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2165 " \t[$addr+$offset], $src;", []>;
2166 def _asi : NVPTXInst<
2168 (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2169 LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
2170 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2171 " \t[$addr+$offset], $src;", []>;
// Instantiate the stores per value type. ST_i8 and ST_i16 both use Int16Regs.
2174 let mayStore=1, hasSideEffects=0 in {
2175 defm ST_i8 : ST<Int16Regs>;
2176 defm ST_i16 : ST<Int16Regs>;
2177 defm ST_i32 : ST<Int32Regs>;
2178 defm ST_i64 : ST<Int64Regs>;
2179 defm ST_f32 : ST<Float32Regs>;
2180 defm ST_f64 : ST<Float64Regs>;
2183 // The following is used only in and after vector elementizations. Vector
2184 // elementization happens at the machine instruction level, so the following
2185 // instructions never appear in the DAG.
//
// Vector load instructions for one element register class. The _v2_* records
// define two destination registers and the _v4_* records define four, printed
// as a brace-enclosed list. The address-form suffixes are the same as in the
// scalar LD multiclass: _avar (imem), _areg / _areg_64 (register),
// _ari / _ari_64 (register plus i32imm offset), _asi (imem plus i32imm
// offset). All pattern lists are empty.
// NOTE(review): the closing braces of this multiclass and of the `let` below
// are missing from this listing — confirm against the upstream file.
2186 multiclass LD_VEC<NVPTXRegClass regclass> {
2187 def _v2_avar : NVPTXInst<
2188 (outs regclass:$dst1, regclass:$dst2),
2189 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2190 i32imm:$fromWidth, imem:$addr),
2191 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2192 "\t{{$dst1, $dst2}}, [$addr];", []>;
2193 def _v2_areg : NVPTXInst<
2194 (outs regclass:$dst1, regclass:$dst2),
2195 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2196 i32imm:$fromWidth, Int32Regs:$addr),
2197 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2198 "\t{{$dst1, $dst2}}, [$addr];", []>;
2199 def _v2_areg_64 : NVPTXInst<
2200 (outs regclass:$dst1, regclass:$dst2),
2201 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2202 i32imm:$fromWidth, Int64Regs:$addr),
2203 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2204 "\t{{$dst1, $dst2}}, [$addr];", []>;
2205 def _v2_ari : NVPTXInst<
2206 (outs regclass:$dst1, regclass:$dst2),
2207 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2208 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2209 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2210 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2211 def _v2_ari_64 : NVPTXInst<
2212 (outs regclass:$dst1, regclass:$dst2),
2213 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2214 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2215 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2216 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2217 def _v2_asi : NVPTXInst<
2218 (outs regclass:$dst1, regclass:$dst2),
2219 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2220 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2221 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2222 "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2223 def _v4_avar : NVPTXInst<
2224 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2225 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2226 i32imm:$fromWidth, imem:$addr),
2227 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2228 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2229 def _v4_areg : NVPTXInst<
2230 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2231 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2232 i32imm:$fromWidth, Int32Regs:$addr),
2233 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2234 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2235 def _v4_areg_64 : NVPTXInst<
2236 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2237 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2238 i32imm:$fromWidth, Int64Regs:$addr),
2239 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2240 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2241 def _v4_ari : NVPTXInst<
2242 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2243 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2244 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2245 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2246 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2247 def _v4_ari_64 : NVPTXInst<
2248 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2249 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2250 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2251 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2252 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2253 def _v4_asi : NVPTXInst<
2254 (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2255 (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2256 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2257 "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2258 "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
// Instantiate the vector loads per element type. LDV_i8 and LDV_i16 both use
// Int16Regs.
2260 let mayLoad=1, hasSideEffects=0 in {
2261 defm LDV_i8 : LD_VEC<Int16Regs>;
2262 defm LDV_i16 : LD_VEC<Int16Regs>;
2263 defm LDV_i32 : LD_VEC<Int32Regs>;
2264 defm LDV_i64 : LD_VEC<Int64Regs>;
2265 defm LDV_f32 : LD_VEC<Float32Regs>;
2266 defm LDV_f64 : LD_VEC<Float64Regs>;
// Vector store instructions for one element register class. The _v2_* records
// take two source registers and the _v4_* records take four, printed as a
// brace-enclosed list after the address. The address-form suffixes are _avar
// (imem), _areg / _areg_64 (register), _ari / _ari_64 (register plus offset)
// and _asi (imem plus offset). All pattern lists are empty.
// Unlike the scalar ST multiclass, the width operand here is named $fromWidth
// rather than $toWidth.
// NOTE(review): in this listing each record goes straight from `NVPTXInst<` to
// its `(ins ...)` list, the _v2 offset forms print $offset without a visible
// $offset operand line, and the closing braces are missing — confirm against
// the upstream file.
2269 multiclass ST_VEC<NVPTXRegClass regclass> {
2270 def _v2_avar : NVPTXInst<
2272 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2273 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
2274 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2275 "\t[$addr], {{$src1, $src2}};", []>;
2276 def _v2_areg : NVPTXInst<
2278 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2279 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
2280 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2281 "\t[$addr], {{$src1, $src2}};", []>;
2282 def _v2_areg_64 : NVPTXInst<
2284 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2285 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
2286 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2287 "\t[$addr], {{$src1, $src2}};", []>;
2288 def _v2_ari : NVPTXInst<
2290 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2291 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
2293 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2294 "\t[$addr+$offset], {{$src1, $src2}};", []>;
2295 def _v2_ari_64 : NVPTXInst<
2297 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2298 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
2300 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2301 "\t[$addr+$offset], {{$src1, $src2}};", []>;
2302 def _v2_asi : NVPTXInst<
2304 (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2305 LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
2307 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2308 "\t[$addr+$offset], {{$src1, $src2}};", []>;
2309 def _v4_avar : NVPTXInst<
2311 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2312 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2313 i32imm:$fromWidth, imem:$addr),
2314 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2315 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2316 def _v4_areg : NVPTXInst<
2318 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2319 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2320 i32imm:$fromWidth, Int32Regs:$addr),
2321 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2322 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2323 def _v4_areg_64 : NVPTXInst<
2325 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2326 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2327 i32imm:$fromWidth, Int64Regs:$addr),
2328 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2329 "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2330 def _v4_ari : NVPTXInst<
2332 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2333 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2334 i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2335 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2336 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
2337 def _v4_ari_64 : NVPTXInst<
2339 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2340 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2341 i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2342 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2343 "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
2344 def _v4_asi : NVPTXInst<
2346 (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2347 LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2348 i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2349 "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
2350 "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
// Instantiate the vector stores per element type. STV_i8 and STV_i16 both use
// Int16Regs.
2353 let mayStore=1, hasSideEffects=0 in {
2354 defm STV_i8 : ST_VEC<Int16Regs>;
2355 defm STV_i16 : ST_VEC<Int16Regs>;
2356 defm STV_i32 : ST_VEC<Int32Regs>;
2357 defm STV_i64 : ST_VEC<Int64Regs>;
2358 defm STV_f32 : ST_VEC<Float32Regs>;
2359 defm STV_f64 : ST_VEC<Float64Regs>;
2363 //---- Conversion ----
// Bit-for-bit reinterpretation between two register classes of equal width.
// Prints "mov.b<SzStr>" from $a (regclassIn) to $d (regclassOut) and is
// selected for a `bitconvert` node from regclassIn to regclassOut.
//   SzStr       - bit width printed after "mov.b" (e.g. "32").
//   regclassIn  - register class of the source operand.
//   regclassOut - register class of the result.
2365 class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
2366 NVPTXRegClass regclassOut> :
2367 NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
2368 !strconcat("mov.b", SzStr, " \t $d, $a;"),
2369 [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
// Integer <-> floating-point bit reinterpretation for 32-bit and 64-bit
// registers, in both directions.
2371 def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
2372 def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
2373 def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
2374 def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
2376 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
2377 // we cannot specify floating-point literals in isel patterns. Therefore, we
2378 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
//
// Integer-to-floating-point conversions. Every pattern uses the CvtRN mode.
// The Int1Regs cases first turn the predicate into a 32-bit integer with
// SELP_u32ii (immediates 1 and 0) and then convert that value.
//
// sint -> f32
2381 def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
2382 (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2383 def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
2384 (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
2385 def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
2386 (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
2387 def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
2388 (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
// uint -> f32
2391 def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
2392 (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2393 def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
2394 (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
2395 def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
2396 (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
2397 def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
2398 (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
// sint -> f64
2401 def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
2402 (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2403 def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
2404 (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
2405 def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
2406 (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
2407 def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
2408 (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
// uint -> f64
2411 def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
2412 (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2413 def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
2414 (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
2415 def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
2416 (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
2417 def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
2418 (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
// f32 -> sint
// Each i16/i32/i64 conversion is listed twice: the first form is restricted
// by Requires<[doF32FTZ]> and uses the CvtRZI_FTZ mode; the second carries no
// predicate and uses CvtRZI.
// NOTE(review): the i1 case bit-casts the float to an integer and compares
// the bits with 0 using CmpEQ, so an all-zero bit pattern yields a true
// predicate -- confirm this polarity is intended.
2422 def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
2423 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
2424 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
2425 (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2426 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
2427 (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
2428 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
2429 (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2430 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
2431 (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
2432 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
2433 (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2434 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
2435 (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
// f32 -> uint
// Same layout as the signed group, using the unsigned CVT destinations. The
// i1 case is the same bit-cast-and-compare form as the signed one.
2438 def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
2439 (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
2440 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
2441 (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2442 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
2443 (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
2444 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
2445 (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2446 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
2447 (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
2448 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
2449 (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2450 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
2451 (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
// f64 -> sint
// No FTZ variants are listed for f64 sources; every sized conversion uses
// the CvtRZI mode. The i1 case bit-casts the double to a 64-bit integer and
// compares the bits with 0 using CmpEQ.
// NOTE(review): as written, an all-zero bit pattern yields a true predicate
// -- confirm this polarity is intended.
2454 def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
2455 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
2456 def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
2457 (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
2458 def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
2459 (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
2460 def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
2461 (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
// f64 -> uint
// Same layout as the signed group, using the unsigned CVT destinations.
2464 def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
2465 (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
2466 def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
2467 (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
2468 def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
2469 (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
2470 def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
2471 (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
// sext i1
// Implemented as a signed selp between the immediates -1 and 0, keyed on the
// predicate register.
2474 def : Pat<(i16 (sext Int1Regs:$a)),
2475 (SELP_s16ii -1, 0, Int1Regs:$a)>;
2476 def : Pat<(i32 (sext Int1Regs:$a)),
2477 (SELP_s32ii -1, 0, Int1Regs:$a)>;
2478 def : Pat<(i64 (sext Int1Regs:$a)),
2479 (SELP_s64ii -1, 0, Int1Regs:$a)>;
// zext i1
// Implemented as an unsigned selp between the immediates 1 and 0.
2482 def : Pat<(i16 (zext Int1Regs:$a)),
2483 (SELP_u16ii 1, 0, Int1Regs:$a)>;
2484 def : Pat<(i32 (zext Int1Regs:$a)),
2485 (SELP_u32ii 1, 0, Int1Regs:$a)>;
2486 def : Pat<(i64 (zext Int1Regs:$a)),
2487 (SELP_u64ii 1, 0, Int1Regs:$a)>;
// anyext i1
// Uses the unsigned selp forms with the immediates -1 and 0; anyext leaves
// the upper bits unspecified, so either extension would satisfy it.
2490 def : Pat<(i16 (anyext Int1Regs:$a)),
2491 (SELP_u16ii -1, 0, Int1Regs:$a)>;
2492 def : Pat<(i32 (anyext Int1Regs:$a)),
2493 (SELP_u32ii -1, 0, Int1Regs:$a)>;
2494 def : Pat<(i64 (anyext Int1Regs:$a)),
2495 (SELP_u64ii -1, 0, Int1Regs:$a)>;
// sext i16
// Sign extension uses the signed CVT forms with CvtNONE.
2498 def : Pat<(i32 (sext Int16Regs:$a)),
2499 (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
2500 def : Pat<(i64 (sext Int16Regs:$a)),
2501 (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
// zext i16
// Zero extension uses the unsigned CVT forms with CvtNONE.
2504 def : Pat<(i32 (zext Int16Regs:$a)),
2505 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2506 def : Pat<(i64 (zext Int16Regs:$a)),
2507 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
// anyext i16
// Mapped to the same unsigned CVT forms as zext.
2510 def : Pat<(i32 (anyext Int16Regs:$a)),
2511 (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2512 def : Pat<(i64 (anyext Int16Regs:$a)),
2513 (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
// sext i32
2516 def : Pat<(i64 (sext Int32Regs:$a)),
2517 (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
// zext i32
2520 def : Pat<(i64 (zext Int32Regs:$a)),
2521 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
// anyext i32
// Mapped to the same unsigned CVT form as zext.
2524 def : Pat<(i64 (anyext Int32Regs:$a)),
2525 (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
// truncate i64
// Truncation to i32/i16 uses the unsigned CVT forms with CvtNONE. Truncation
// to i1 masks the source with 1 and compares the result to 1 with CmpEQ, so
// only bit 0 of the source decides the predicate.
2529 def : Pat<(i32 (trunc Int64Regs:$a)),
2530 (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
2531 def : Pat<(i16 (trunc Int64Regs:$a)),
2532 (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
2533 def : Pat<(i1 (trunc Int64Regs:$a)),
2534 (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
// truncate i32
2537 def : Pat<(i16 (trunc Int32Regs:$a)),
2538 (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
2539 def : Pat<(i1 (trunc Int32Regs:$a)),
2540 (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
// truncate i16
2543 def : Pat<(i1 (trunc Int16Regs:$a)),
2544 (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
// sext_inreg
// One pattern per (register width, extended-from width) pair, each mapped to
// the CVT_INREG instruction named for that pair.
2547 def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
2548 def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
2549 def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
2550 def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
2551 def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
2552 def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
2555 // Select instructions with 32-bit predicates
// The i32 condition is reduced to a predicate by masking it with 1 and
// comparing the result to 1 (CmpEQ); that predicate then drives the SELP
// form matching the selected value type (b16/b32/b64/f32/f64).
2556 def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
2557 (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
2558 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2559 def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
2560 (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
2561 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2562 def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
2563 (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
2564 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2565 def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
2566 (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
2567 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2568 def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
2569 (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
2570 (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
// Register pack/unpack moves. Every definition here has an empty selection
// pattern list ([]), so none of them is matched directly from a DAG pattern
// in this block; each emits a single mov.b32/mov.b64 with a brace-enclosed
// register list on one side.
// NOTE(review): no input operand list is visible here for I64toV4I16,
// I32toV2I16 and I64toV2I32 although their asm strings reference $s, and the
// closing brace of this let-block is not visible either -- confirm against
// the complete file.
2573 let hasSideEffects = 0 in {
2574 // pack a set of smaller int registers to a larger int register
2575 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
2576 (ins Int16Regs:$s1, Int16Regs:$s2,
2577 Int16Regs:$s3, Int16Regs:$s4),
2578 "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
2579 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
2580 (ins Int16Regs:$s1, Int16Regs:$s2),
2581 "mov.b32\t$d, {{$s1, $s2}};", []>;
2582 def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
2583 (ins Int32Regs:$s1, Int32Regs:$s2),
2584 "mov.b64\t$d, {{$s1, $s2}};", []>;
2585 def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
2586 (ins Float32Regs:$s1, Float32Regs:$s2),
2587 "mov.b64\t$d, {{$s1, $s2}};", []>;
2589 // unpack a larger int register to a set of smaller int registers
2590 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
2591 Int16Regs:$d3, Int16Regs:$d4),
2593 "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
2594 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
2596 "mov.b32\t{{$d1, $d2}}, $s;", []>;
2597 def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
2599 "mov.b64\t{{$d1, $d2}}, $s;", []>;
2600 def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
2601 (ins Float64Regs:$s),
2602 "mov.b64\t{{$d1, $d2}}, $s;", []>;
2605 // Count leading zeros
// Both instruction forms write a 32-bit destination (Int32Regs:$d), including
// the one that takes a 64-bit source. Neither carries a selection pattern of
// its own; the ctlz patterns that follow select them.
2606 let hasSideEffects = 0 in {
2607 def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
2608 "clz.b32\t$d, $a;", []>;
2609 def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
2610 "clz.b64\t$d, $a;", []>;
2613 // 32-bit has a direct PTX instruction
2614 def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
2616 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend
2617 // to 64-bit to match the LLVM semantics
2618 def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
2620 // For 16-bit, we zero-extend to 32-bit, then trunc the result back
2621 // to 16-bits (ctlz of a 16-bit value is guaranteed to require less
2622 // than 16 bits to store). We also need to subtract 16 because the
2623 // high-order 16 zeros were counted.
2624 def : Pat<(ctlz Int16Regs:$a),
2625 (SUBi16ri (CVT_u16_u32 (CLZr32
2626 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
// Population count
// Both instruction forms write a 32-bit destination (Int32Regs:$d), including
// the one that takes a 64-bit source. Neither carries a selection pattern of
// its own; the ctpop patterns that follow select them.
2630 let hasSideEffects = 0 in {
2631 def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
2632 "popc.b32\t$d, $a;", []>;
2633 def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
2634 "popc.b64\t$d, $a;", []>;
2637 // 32-bit has a direct PTX instruction
2638 def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
2640 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend
2641 // to 64-bit to match the LLVM semantics
2642 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
2644 // For 16-bit, we zero-extend to 32-bit, then trunc the result back
2645 // to 16-bits (ctpop of a 16-bit value is guaranteed to require less
2646 // than 16 bits to store)
2647 def : Pat<(ctpop Int16Regs:$a),
2648 (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
2650 // fpround f64 -> f32
// The form restricted by Requires<[doF32FTZ]> uses CvtRN_FTZ; the
// unpredicated form that follows uses CvtRN.
2651 def : Pat<(f32 (fpround Float64Regs:$a)),
2652 (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
2653 def : Pat<(f32 (fpround Float64Regs:$a)),
2654 (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
2656 // fpextend f32 -> f64
// The form restricted by Requires<[doF32FTZ]> uses CvtNONE_FTZ; the
// unpredicated form that follows uses CvtNONE.
2657 def : Pat<(f64 (fpextend Float32Regs:$a)),
2658 (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
2659 def : Pat<(f64 (fpextend Float32Regs:$a)),
2660 (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
// SelectionDAG node for NVPTXISD::RET_FLAG. It uses the SDTNone type profile
// and is declared with a chain and an optional incoming glue.
2662 def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
2663 [SDNPHasChain, SDNPOptInGlue]>;
2665 // fceil, ffloor, fround, ftrunc.
// Each operation is a same-type CVT with an integer-rounding mode: fceil ->
// CvtRPI, ffloor -> CvtRMI, fround -> CvtRNI, ftrunc -> CvtRZI. The f32 forms
// come in two mutually exclusive variants: the _FTZ mode under doF32FTZ and
// the plain mode under doNoF32FTZ. The f64 forms have a single variant.
2667 def : Pat<(fceil Float32Regs:$a),
2668 (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
2669 def : Pat<(fceil Float32Regs:$a),
2670 (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
2671 def : Pat<(fceil Float64Regs:$a),
2672 (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
2674 def : Pat<(ffloor Float32Regs:$a),
2675 (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
2676 def : Pat<(ffloor Float32Regs:$a),
2677 (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
2678 def : Pat<(ffloor Float64Regs:$a),
2679 (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
// NOTE(review): fround uses CvtRNI, the same mode this file uses for
// nearbyint/rint and describes there as rounding to nearest even. If fround
// is expected to round halfway cases away from zero, the results differ on
// exact ties -- confirm.
2681 def : Pat<(fround Float32Regs:$a),
2682 (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
2683 def : Pat<(f32 (fround Float32Regs:$a)),
2684 (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
2685 def : Pat<(f64 (fround Float64Regs:$a)),
2686 (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
2688 def : Pat<(ftrunc Float32Regs:$a),
2689 (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2690 def : Pat<(ftrunc Float32Regs:$a),
2691 (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
2692 def : Pat<(ftrunc Float64Regs:$a),
2693 (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
2695 // nearbyint and rint are implemented as rounding to nearest even. This isn't
2696 // strictly correct, because it causes us to ignore the rounding mode. But it
2697 // matches what CUDA's "libm" does.
// Both operations map to the same CVT mode (CvtRNI, or CvtRNI_FTZ for f32
// under doF32FTZ), so their selected code is identical.
2699 def : Pat<(fnearbyint Float32Regs:$a),
2700 (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
2701 def : Pat<(fnearbyint Float32Regs:$a),
2702 (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
2703 def : Pat<(fnearbyint Float64Regs:$a),
2704 (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
2706 def : Pat<(frint Float32Regs:$a),
2707 (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
2708 def : Pat<(frint Float32Regs:$a),
2709 (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
2710 def : Pat<(frint Float64Regs:$a),
2711 (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
2714 //-----------------------------------
2716 //-----------------------------------
// Terminator instructions:
//  - Return emits "ret;" and is selected for the retflag node.
//  - CBranch emits a bra guarded by the predicate $a and is selected for
//    brcond on an i1 condition.
//  - CBranchOther emits a bra guarded by the negated predicate (!$a); it has
//    no selection pattern of its own.
//  - GOTO emits "bra.uni" and is selected for an unconditional br.
// NOTE(review): the closing brace of the isTerminator let-block is not
// visible here -- confirm the scope against the complete file.
2718 let isTerminator=1 in {
2719 let isReturn=1, isBarrier=1 in
2720 def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
2723 def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
2724 "@$a bra \t$target;",
2725 [(brcond Int1Regs:$a, bb:$target)]>;
2727 def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
2728 "@!$a bra \t$target;", []>;
2730 let isBranch=1, isBarrier=1 in
2731 def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
2732 "bra.uni \t$target;", [(br bb:$target)]>;
// brcond on an i32 condition: compare the value against 0 with CmpNE to form
// the predicate, then branch with CBranch.
2735 def : Pat<(brcond Int32Regs:$a, bb:$target),
2736 (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
2738 // SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
2739 // conditional branch if the target block is the next block so that the code
2740 // can fall through to the target block. The inversion is done by 'xor
2741 // condition, 1', which will be translated to (setne condition, -1). Since ptx
2742 // supports '@!pred bra target', we should use it.
2743 def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
2744 (CBranchOther Int1Regs:$a, bb:$target)>;
// Type profiles and SelectionDAG nodes for the call-sequence markers.
// callseq_start takes one i32 operand; callseq_end takes two i32 operands.
// NOTE(review): the property list of callseq_end is cut off after
// SDNPOutGlue here -- confirm the remaining entries against the complete
// file.
2747 def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
2748 def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
2750 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
2751 [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
2752 def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
2753 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
// NVPTXISD::CALL node: no results, one i32 operand (the callee), with chain,
// optional incoming glue and outgoing glue.
2756 def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
2757 def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
2758 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
2759 def calltarget : Operand<i32>;
// CALL has no selection pattern of its own; the two patterns below select it
// for global-address and external-symbol callees.
2761 def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
2764 def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
2765 def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
2767 // Pseudo instructions.
// Pseudo forwards all four arguments unchanged to NVPTXInst.
2768 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
2769 : NVPTXInst<outs, ins, asmstr, pattern>;
// Call-sequence markers. The instruction matched by callseq_start prints an
// opening brace, a "callseq" comment, and a .reg .b32 declaration of
// temp_param_reg; the one matched by callseq_end prints the closing brace
// and a "callseq" comment.
// NOTE(review): the "def NAME :" header lines for these two instructions are
// not visible here -- confirm their names against the complete file.
2772 NVPTXInst<(outs), (ins i32imm:$amt),
2773 "\\{ // callseq $amt\n"
2774 "\t.reg .b32 temp_param_reg;",
2775 [(callseq_start timm:$amt)]>;
2777 NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
2778 "\\} // callseq $amt1",
2779 [(callseq_end timm:$amt1, timm:$amt2)]>;
// Selected for the trap node; takes no operands and emits "trap;".
2782 def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
2784 // Call prototype wrapper
// The node has no results and one integer operand, with chain, incoming and
// outgoing glue, and a side-effect marker. CALL_PROTOTYPE is selected for it
// when the operand is an i32 external symbol and prints that operand through
// the ProtoIdent operand's printProtoIdent method.
// NOTE(review): the "def NAME :" header of the SDNode and the closing brace
// of ProtoIdent are not visible here -- confirm against the complete file.
2785 def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
2787 SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
2788 [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2789 def ProtoIdent : Operand<i32> {
2790 let PrintMethod = "printProtoIdent";
2792 def CALL_PROTOTYPE :
2793 NVPTXInst<(outs), (ins ProtoIdent:$ident),
2794 "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
2797 include "NVPTXIntrinsics.td"
2800 //-----------------------------------
2802 //-----------------------------------
2803 // BSWAP is currently expanded. The following would be more efficient:
2804 // - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
2805 // - for sm_20, use prmt (use vector scalar mov to get the pack and
2806 // unpack). sm_20 supports native 32-bit register, but not native 16-bit