1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
34 #define DEBUG_TYPE "amdgpu-legalinfo"
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
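// Pads a vector with one extra element so a small odd-element vector becomes
// a 32-bit multiple, e.g. v3s16 (48 bits) -> v4s16 (64 bits).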
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
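// Splits a wide vector into pieces of at most 64 bits each, e.g. v3s32
// (96 bits) -> 2 pieces -> v2s32; a 64-bit piece needs no further splitting.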
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94 // Increase the number of vector elements to reach the next multiple of 32-bit
95 // registers.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
105 assert(EltSize < 32);
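// e.g. v3s8 (24 bits): NextMul32 = 1, so NewNumElts = (32 + 7) / 8 = 4,
// i.e. the type is padded out to v4s8 (32 bits).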
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
225 const LLT CodePtr = FlatPtr;
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
235 const std::initializer_list<LLT> FPTypesBase = {
239 const std::initializer_list<LLT> FPTypes16 = {
243 const std::initializer_list<LLT> FPTypesPK16 = {
247 setAction({G_BRCOND, S1}, Legal); // VCC branches
248 setAction({G_BRCOND, S32}, Legal); // SCC branches
250 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251 // elements for v3s16
252 getActionDefinitionsBuilder(G_PHI)
253 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254 .legalFor(AllS32Vectors)
255 .legalFor(AllS64Vectors)
256 .legalFor(AddrSpaces64)
257 .legalFor(AddrSpaces32)
258 .clampScalar(0, S32, S256)
259 .widenScalarToNextPow2(0, 32)
260 .clampMaxNumElements(0, S32, 16)
261 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262 .legalIf(isPointer(0));
264 if (ST.has16BitInsts()) {
265 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266 .legalFor({S32, S16})
267 .clampScalar(0, S16, S32)
270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
272 .clampScalar(0, S32, S32)
276 // FIXME: Not really legal. Placeholder for custom lowering.
277 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278 .legalFor({S32, S64})
279 .clampScalar(0, S32, S64)
280 .widenScalarToNextPow2(0, 32)
283 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
285 .clampScalar(0, S32, S32)
288 // Report legal for any types we can handle anywhere. For the cases only legal
289 // on the SALU, RegBankSelect will be able to re-legalize.
290 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292 .clampScalar(0, S32, S64)
293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295 .widenScalarToNextPow2(0)
298 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300 .legalFor({{S32, S1}, {S32, S32}})
301 .clampScalar(0, S32, S32)
302 .scalarize(0); // TODO: Implement.
304 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
307 getActionDefinitionsBuilder(G_BITCAST)
308 // Don't worry about the size constraint.
309 .legalIf(all(isRegisterType(0), isRegisterType(1)))
310 // FIXME: Testing hack
311 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
313 getActionDefinitionsBuilder(G_FCONSTANT)
314 .legalFor({S32, S64, S16})
315 .clampScalar(0, S16, S64);
317 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321 .clampScalarOrElt(0, S32, S1024)
322 .legalIf(isMultiple32(0))
323 .widenScalarToNextPow2(0, 32)
324 .clampMaxNumElements(0, S32, 16);
327 // FIXME: i1 operands to intrinsics should always be legal, but other i1
328 // values may not be legal. We need to figure out how to distinguish
329 // between these two scenarios.
330 getActionDefinitionsBuilder(G_CONSTANT)
331 .legalFor({S1, S32, S64, S16, GlobalPtr,
332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333 .clampScalar(0, S32, S64)
334 .widenScalarToNextPow2(0)
335 .legalIf(isPointer(0));
337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
342 auto &FPOpActions = getActionDefinitionsBuilder(
343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344 .legalFor({S32, S64});
345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346 .customFor({S32, S64});
347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348 .customFor({S32, S64});
350 if (ST.has16BitInsts()) {
351 if (ST.hasVOP3PInsts())
352 FPOpActions.legalFor({S16, V2S16});
354 FPOpActions.legalFor({S16});
356 TrigActions.customFor({S16});
357 FDIVActions.customFor({S16});
360 auto &MinNumMaxNum = getActionDefinitionsBuilder({
361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
363 if (ST.hasVOP3PInsts()) {
364 MinNumMaxNum.customFor(FPTypesPK16)
365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366 .clampMaxNumElements(0, S16, 2)
367 .clampScalar(0, S16, S64)
369 } else if (ST.has16BitInsts()) {
370 MinNumMaxNum.customFor(FPTypes16)
371 .clampScalar(0, S16, S64)
374 MinNumMaxNum.customFor(FPTypesBase)
375 .clampScalar(0, S32, S64)
379 if (ST.hasVOP3PInsts())
380 FPOpActions.clampMaxNumElements(0, S16, 2);
382 FPOpActions
383 .scalarize(0)
384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
386 TrigActions
387 .scalarize(0)
388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
390 FDIVActions
391 .scalarize(0)
392 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
394 getActionDefinitionsBuilder({G_FNEG, G_FABS})
395 .legalFor(FPTypesPK16)
396 .clampMaxNumElements(0, S16, 2)
398 .clampScalar(0, S16, S64);
401 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
403 if (ST.has16BitInsts()) {
404 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405 .legalFor({S32, S64, S16})
407 .clampScalar(0, S16, S64);
409 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410 .legalFor({S32, S64})
412 .clampScalar(0, S32, S64);
415 getActionDefinitionsBuilder(G_FPTRUNC)
416 .legalFor({{S32, S64}, {S16, S32}})
419 getActionDefinitionsBuilder(G_FPEXT)
420 .legalFor({{S64, S32}, {S32, S16}})
421 .lowerFor({{S64, S16}}) // FIXME: Implement
424 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
427 getActionDefinitionsBuilder(G_FSUB)
428 // Use actual fsub instruction
430 // Must use fadd + fneg
431 .lowerFor({S64, S16, V2S16})
433 .clampScalar(0, S32, S64);
435 // Whether this is legal depends on the floating point mode for the function.
436 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
438 FMad.customFor({S32, S16});
440 FMad.customFor({S32});
444 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446 {S32, S1}, {S64, S1}, {S16, S1},
449 {S64, LLT::scalar(33)},
450 {S32, S8}, {S32, LLT::scalar(24)}})
452 .clampScalar(0, S32, S64);
454 // TODO: Split s1->s64 during regbankselect for VALU.
455 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
457 .lowerFor({{S32, S64}})
458 .lowerIf(typeIs(1, S1))
459 .customFor({{S64, S64}});
460 if (ST.has16BitInsts())
461 IToFP.legalFor({{S16, S16}});
462 IToFP.clampScalar(1, S32, S64)
465 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
466 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
467 if (ST.has16BitInsts())
468 FPToI.legalFor({{S16, S16}});
470 FPToI.minScalar(1, S32);
472 FPToI.minScalar(0, S32)
475 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
479 if (ST.has16BitInsts()) {
480 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481 .legalFor({S16, S32, S64})
482 .clampScalar(0, S16, S64)
484 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
485 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
486 .legalFor({S32, S64})
487 .clampScalar(0, S32, S64)
490 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
493 .clampScalar(0, S32, S64)
497 getActionDefinitionsBuilder(G_PTR_ADD)
498 .legalForCartesianProduct(AddrSpaces64, {S64})
499 .legalForCartesianProduct(AddrSpaces32, {S32})
502 getActionDefinitionsBuilder(G_PTR_MASK)
506 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
509 getActionDefinitionsBuilder(G_ICMP)
510 // The compare output type differs based on the register bank of the output,
511 // so make both s1 and s32 legal.
513 // Scalar compares producing output in scc will be promoted to s32, as that
514 // is the allocatable register type that will be needed for the copy from
515 // scc. This will be promoted during RegBankSelect, and we assume something
516 // before that won't try to use s32 result types.
518 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519 // bank.
520 .legalForCartesianProduct(
521 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522 .legalForCartesianProduct(
523 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
524 if (ST.has16BitInsts()) {
525 CmpBuilder.legalFor({{S1, S16}});
529 .widenScalarToNextPow2(1)
530 .clampScalar(1, S32, S64)
532 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
534 getActionDefinitionsBuilder(G_FCMP)
535 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
536 .widenScalarToNextPow2(1)
537 .clampScalar(1, S32, S64)
540 // FIXME: fexp, flog2, flog10 need to be custom lowered.
541 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
542 G_FLOG, G_FLOG2, G_FLOG10})
546 // The 64-bit versions produce 32-bit results, but only on the SALU.
547 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
548 G_CTTZ, G_CTTZ_ZERO_UNDEF,
550 .legalFor({{S32, S32}, {S32, S64}})
551 .clampScalar(0, S32, S32)
552 .clampScalar(1, S32, S64)
554 .widenScalarToNextPow2(0, 32)
555 .widenScalarToNextPow2(1, 32);
557 // TODO: Expand for > s32
558 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
560 .clampScalar(0, S32, S32)
563 if (ST.has16BitInsts()) {
564 if (ST.hasVOP3PInsts()) {
565 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
566 .legalFor({S32, S16, V2S16})
567 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
568 .clampMaxNumElements(0, S16, 2)
569 .clampScalar(0, S16, S32)
570 .widenScalarToNextPow2(0)
573 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
574 .legalFor({S32, S16})
575 .widenScalarToNextPow2(0)
576 .clampScalar(0, S16, S32)
580 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
582 .clampScalar(0, S32, S32)
583 .widenScalarToNextPow2(0)
587 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
588 return [=](const LegalityQuery &Query) {
589 return Query.Types[TypeIdx0].getSizeInBits() <
590 Query.Types[TypeIdx1].getSizeInBits();
594 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
595 return [=](const LegalityQuery &Query) {
596 return Query.Types[TypeIdx0].getSizeInBits() >
597 Query.Types[TypeIdx1].getSizeInBits();
601 getActionDefinitionsBuilder(G_INTTOPTR)
602 // List the common cases
603 .legalForCartesianProduct(AddrSpaces64, {S64})
604 .legalForCartesianProduct(AddrSpaces32, {S32})
606 // Accept any address space as long as the size matches
607 .legalIf(sameSize(0, 1))
608 .widenScalarIf(smallerThan(1, 0),
609 [](const LegalityQuery &Query) {
610 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
612 .narrowScalarIf(greaterThan(1, 0),
613 [](const LegalityQuery &Query) {
614 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
617 getActionDefinitionsBuilder(G_PTRTOINT)
618 // List the common cases
619 .legalForCartesianProduct(AddrSpaces64, {S64})
620 .legalForCartesianProduct(AddrSpaces32, {S32})
622 // Accept any address space as long as the size matches
623 .legalIf(sameSize(0, 1))
624 .widenScalarIf(smallerThan(0, 1),
625 [](const LegalityQuery &Query) {
626 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
630 [](const LegalityQuery &Query) {
631 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
634 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
638 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
639 // handle some operations by just promoting the register during
640 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
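// Maximum size (in bits) that a single load/store may access in each address
// space; wider accesses get split up by the rules below.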
641 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
643 // FIXME: Private element size.
644 case AMDGPUAS::PRIVATE_ADDRESS:
646 // FIXME: Check subtarget
647 case AMDGPUAS::LOCAL_ADDRESS:
648 return ST.useDS128() ? 128 : 64;
650 // Treat constant and global as identical. SMRD loads are sometimes usable
651 // for global loads (ideally constant address space should be eliminated)
652 // depending on the context. Legality cannot be context dependent, but
653 // RegBankSelect can split the load as necessary depending on the pointer
654 // register bank/uniformity and if the memory is invariant or not written in
655 // this function.
656 case AMDGPUAS::CONSTANT_ADDRESS:
657 case AMDGPUAS::GLOBAL_ADDRESS:
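// Returns true if a load must be broken into smaller pieces: vector extloads,
// loads wider than the address space allows, unsupported 3-dword (96-bit)
// accesses, and accesses that are under-aligned for this subtarget.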
664 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
665 const LLT DstTy = Query.Types[0];
667 // Split vector extloads.
668 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669 unsigned Align = Query.MMODescrs[0].AlignInBits;
671 if (MemSize < DstTy.getSizeInBits())
672 MemSize = std::max(MemSize, Align);
674 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
677 const LLT PtrTy = Query.Types[1];
678 unsigned AS = PtrTy.getAddressSpace();
679 if (MemSize > maxSizeForAddrSpace(AS))
682 // Catch weird sized loads that don't evenly divide into the access sizes
683 // TODO: May be able to widen depending on alignment etc.
684 unsigned NumRegs = MemSize / 32;
685 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
688 if (Align < MemSize) {
689 const SITargetLowering *TLI = ST.getTargetLowering();
690 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
696 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
697 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
698 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
700 // TODO: Refine based on subtargets which support unaligned access or 128-bit
702 // TODO: Unsupported flat for SI.
704 for (unsigned Op : {G_LOAD, G_STORE}) {
705 const bool IsStore = Op == G_STORE;
707 auto &Actions = getActionDefinitionsBuilder(Op);
708 // Whitelist the common cases.
709 // TODO: Pointer loads
710 // TODO: Wide constant loads
711 // TODO: Only CI+ has 3x loads
712 // TODO: Loads to s16 on gfx9
713 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
714 {V2S32, GlobalPtr, 64, GlobalAlign32},
715 {V3S32, GlobalPtr, 96, GlobalAlign32},
716 {S96, GlobalPtr, 96, GlobalAlign32},
717 {V4S32, GlobalPtr, 128, GlobalAlign32},
718 {S128, GlobalPtr, 128, GlobalAlign32},
719 {S64, GlobalPtr, 64, GlobalAlign32},
720 {V2S64, GlobalPtr, 128, GlobalAlign32},
721 {V2S16, GlobalPtr, 32, GlobalAlign32},
722 {S32, GlobalPtr, 8, GlobalAlign8},
723 {S32, GlobalPtr, 16, GlobalAlign16},
725 {S32, LocalPtr, 32, 32},
726 {S64, LocalPtr, 64, 32},
727 {V2S32, LocalPtr, 64, 32},
728 {S32, LocalPtr, 8, 8},
729 {S32, LocalPtr, 16, 16},
730 {V2S16, LocalPtr, 32, 32},
732 {S32, PrivatePtr, 32, 32},
733 {S32, PrivatePtr, 8, 8},
734 {S32, PrivatePtr, 16, 16},
735 {V2S16, PrivatePtr, 32, 32},
737 {S32, FlatPtr, 32, GlobalAlign32},
738 {S32, FlatPtr, 16, GlobalAlign16},
739 {S32, FlatPtr, 8, GlobalAlign8},
740 {V2S16, FlatPtr, 32, GlobalAlign32},
742 {S32, ConstantPtr, 32, GlobalAlign32},
743 {V2S32, ConstantPtr, 64, GlobalAlign32},
744 {V3S32, ConstantPtr, 96, GlobalAlign32},
745 {V4S32, ConstantPtr, 128, GlobalAlign32},
746 {S64, ConstantPtr, 64, GlobalAlign32},
747 {S128, ConstantPtr, 128, GlobalAlign32},
748 {V2S32, ConstantPtr, 32, GlobalAlign32}});
750 .customIf(typeIs(1, Constant32Ptr))
752 [=](const LegalityQuery &Query) -> bool {
753 return !Query.Types[0].isVector() && needToSplitLoad(Query);
755 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
756 const LLT DstTy = Query.Types[0];
757 const LLT PtrTy = Query.Types[1];
759 const unsigned DstSize = DstTy.getSizeInBits();
760 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
763 if (DstSize > MemSize)
764 return std::make_pair(0, LLT::scalar(MemSize));
766 if (DstSize > 32 && (DstSize % 32 != 0)) {
767 // FIXME: Need a way to specify non-extload of larger size if
769 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
772 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773 if (MemSize > MaxSize)
774 return std::make_pair(0, LLT::scalar(MaxSize));
776 unsigned Align = Query.MMODescrs[0].AlignInBits;
777 return std::make_pair(0, LLT::scalar(Align));
780 [=](const LegalityQuery &Query) -> bool {
781 return Query.Types[0].isVector() && needToSplitLoad(Query);
783 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
784 const LLT DstTy = Query.Types[0];
785 const LLT PtrTy = Query.Types[1];
787 LLT EltTy = DstTy.getElementType();
788 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
790 // Split if it's too large for the address space.
791 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792 unsigned NumElts = DstTy.getNumElements();
793 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
795 // FIXME: Refine when odd breakdowns handled
796 // The scalars will need to be re-legalized.
797 if (NumPieces == 1 || NumPieces >= NumElts ||
798 NumElts % NumPieces != 0)
799 return std::make_pair(0, EltTy);
801 return std::make_pair(0,
802 LLT::vector(NumElts / NumPieces, EltTy));
805 // Need to split because of alignment.
806 unsigned Align = Query.MMODescrs[0].AlignInBits;
807 unsigned EltSize = EltTy.getSizeInBits();
808 if (EltSize > Align &&
809 (EltSize / Align < DstTy.getNumElements())) {
810 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
813 // May need relegalization for the scalars.
814 return std::make_pair(0, EltTy);
819 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
821 // TODO: Need a bitcast lower option?
823 .legalIf([=](const LegalityQuery &Query) {
824 const LLT Ty0 = Query.Types[0];
825 unsigned Size = Ty0.getSizeInBits();
826 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
827 unsigned Align = Query.MMODescrs[0].AlignInBits;
829 // FIXME: Widening store from alignment not valid.
831 MemSize = std::max(MemSize, Align);
833 // No extending vector loads.
834 if (Size > MemSize && Ty0.isVector())
846 return ST.hasDwordx3LoadStores();
854 .widenScalarToNextPow2(0)
855 // TODO: v3s32->v4s32 with alignment
856 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
859 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
860 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
861 {S32, GlobalPtr, 16, 2 * 8},
862 {S32, LocalPtr, 8, 8},
863 {S32, LocalPtr, 16, 16},
864 {S32, PrivatePtr, 8, 8},
865 {S32, PrivatePtr, 16, 16},
866 {S32, ConstantPtr, 8, 8},
867 {S32, ConstantPtr, 16, 2 * 8}});
868 if (ST.hasFlatAddressSpace()) {
869 ExtLoads.legalForTypesWithMemDesc(
870 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
873 ExtLoads.clampScalar(0, S32, S32)
874 .widenScalarToNextPow2(0)
875 .unsupportedIfMemSizeNotPow2()
878 auto &Atomics = getActionDefinitionsBuilder(
879 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
880 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
881 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
883 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
884 {S64, GlobalPtr}, {S64, LocalPtr}});
885 if (ST.hasFlatAddressSpace()) {
886 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
889 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
890 .legalFor({{S32, LocalPtr}});
892 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
894 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896 {S32, FlatPtr}, {S64, FlatPtr}})
897 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898 {S32, RegionPtr}, {S64, RegionPtr}});
900 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
903 // TODO: Pointer types, any 32-bit or 64-bit vector
905 // Condition should be s32 for scalar, s1 for vector.
906 getActionDefinitionsBuilder(G_SELECT)
907 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
908 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
910 .clampScalar(0, S16, S64)
911 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
914 .clampMaxNumElements(0, S32, 2)
915 .clampMaxNumElements(0, LocalPtr, 2)
916 .clampMaxNumElements(0, PrivatePtr, 2)
918 .widenScalarToNextPow2(0)
919 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
921 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
922 // be more flexible with the shift amount type.
923 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
924 .legalFor({{S32, S32}, {S64, S32}});
925 if (ST.has16BitInsts()) {
926 if (ST.hasVOP3PInsts()) {
927 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
928 .clampMaxNumElements(0, S16, 2);
930 Shifts.legalFor({{S16, S32}, {S16, S16}});
932 // TODO: Support 16-bit shift amounts
933 Shifts.clampScalar(1, S32, S32);
934 Shifts.clampScalar(0, S16, S64);
935 Shifts.widenScalarToNextPow2(0, 16);
937 // Make sure we legalize the shift amount type first, as the general
938 // expansion for the shifted type will produce much worse code if it hasn't
939 // been truncated already.
940 Shifts.clampScalar(1, S32, S32);
941 Shifts.clampScalar(0, S32, S64);
942 Shifts.widenScalarToNextPow2(0, 32);
946 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
947 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
948 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
949 unsigned IdxTypeIdx = 2;
951 getActionDefinitionsBuilder(Op)
952 .customIf([=](const LegalityQuery &Query) {
953 const LLT EltTy = Query.Types[EltTypeIdx];
954 const LLT VecTy = Query.Types[VecTypeIdx];
955 const LLT IdxTy = Query.Types[IdxTypeIdx];
956 return (EltTy.getSizeInBits() == 16 ||
957 EltTy.getSizeInBits() % 32 == 0) &&
958 VecTy.getSizeInBits() % 32 == 0 &&
959 VecTy.getSizeInBits() <= 1024 &&
960 IdxTy.getSizeInBits() == 32;
962 .clampScalar(EltTypeIdx, S32, S64)
963 .clampScalar(VecTypeIdx, S32, S64)
964 .clampScalar(IdxTypeIdx, S32, S32);
967 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
968 .unsupportedIf([=](const LegalityQuery &Query) {
969 const LLT &EltTy = Query.Types[1].getElementType();
970 return Query.Types[0] != EltTy;
973 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
974 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
975 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
977 // FIXME: Doesn't handle extract of illegal sizes.
978 getActionDefinitionsBuilder(Op)
979 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
980 // FIXME: Multiples of 16 should not be legal.
981 .legalIf([=](const LegalityQuery &Query) {
982 const LLT BigTy = Query.Types[BigTyIdx];
983 const LLT LitTy = Query.Types[LitTyIdx];
984 return (BigTy.getSizeInBits() % 32 == 0) &&
985 (LitTy.getSizeInBits() % 16 == 0);
988 [=](const LegalityQuery &Query) {
989 const LLT BigTy = Query.Types[BigTyIdx];
990 return (BigTy.getScalarSizeInBits() < 16);
992 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
994 [=](const LegalityQuery &Query) {
995 const LLT LitTy = Query.Types[LitTyIdx];
996 return (LitTy.getScalarSizeInBits() < 16);
998 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
999 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1000 .widenScalarToNextPow2(BigTyIdx, 32);
1004 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1005 .legalForCartesianProduct(AllS32Vectors, {S32})
1006 .legalForCartesianProduct(AllS64Vectors, {S64})
1007 .clampNumElements(0, V16S32, V32S32)
1008 .clampNumElements(0, V2S64, V16S64)
1009 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1011 if (ST.hasScalarPackInsts())
1012 BuildVector.legalFor({V2S16, S32});
1015 .minScalarSameAs(1, 0)
1016 .legalIf(isRegisterType(0))
1017 .minScalarOrElt(0, S32);
1019 if (ST.hasScalarPackInsts()) {
1020 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1021 .legalFor({V2S16, S32})
1024 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1028 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1029 .legalIf(isRegisterType(0));
1031 // TODO: Don't fully scalarize v2s16 pieces
1032 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1035 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1036 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1037 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1039 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1040 const LLT &Ty = Query.Types[TypeIdx];
1041 if (Ty.isVector()) {
1042 const LLT &EltTy = Ty.getElementType();
1043 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1045 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1051 auto &Builder = getActionDefinitionsBuilder(Op)
1052 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1053 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1054 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1055 // valid.
1056 .clampScalar(LitTyIdx, S16, S256)
1057 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1058 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1059 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1060 elementTypeIs(1, S16)),
1062 // Break up vectors with weird elements into scalars
1064 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1067 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1069 .clampScalar(BigTyIdx, S32, S1024)
1070 .lowerFor({{S16, V2S16}});
1072 if (Op == G_MERGE_VALUES) {
1073 Builder.widenScalarIf(
1074 // TODO: Use 16-bit shifts if legal for 8-bit values?
1075 [=](const LegalityQuery &Query) {
1076 const LLT Ty = Query.Types[LitTyIdx];
1077 return Ty.getSizeInBits() < 32;
1079 changeTo(LitTyIdx, S32));
1082 Builder.widenScalarIf(
1083 [=](const LegalityQuery &Query) {
1084 const LLT Ty = Query.Types[BigTyIdx];
1085 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086 Ty.getSizeInBits() % 16 != 0;
1088 [=](const LegalityQuery &Query) {
1089 // Pick the next power of 2, or a multiple of 64 over 128.
1090 // Whichever is smaller.
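// e.g. s65 widens to s128, but s257 widens to s320 (the next multiple of 64)
// rather than all the way to s512.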
1091 const LLT &Ty = Query.Types[BigTyIdx];
1092 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093 if (NewSizeInBits >= 256) {
1094 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095 if (RoundedTo < NewSizeInBits)
1096 NewSizeInBits = RoundedTo;
1098 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1100 .legalIf([=](const LegalityQuery &Query) {
1101 const LLT &BigTy = Query.Types[BigTyIdx];
1102 const LLT &LitTy = Query.Types[LitTyIdx];
1104 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1106 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1109 return BigTy.getSizeInBits() % 16 == 0 &&
1110 LitTy.getSizeInBits() % 16 == 0 &&
1111 BigTy.getSizeInBits() <= 1024;
1113 // Any vectors left are the wrong size. Scalarize them.
1118 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1120 getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1122 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1125 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1131 verify(*ST.getInstrInfo());
1134 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1135 MachineRegisterInfo &MRI,
1136 MachineIRBuilder &B,
1137 GISelChangeObserver &Observer) const {
1138 switch (MI.getOpcode()) {
1139 case TargetOpcode::G_ADDRSPACE_CAST:
1140 return legalizeAddrSpaceCast(MI, MRI, B);
1141 case TargetOpcode::G_FRINT:
1142 return legalizeFrint(MI, MRI, B);
1143 case TargetOpcode::G_FCEIL:
1144 return legalizeFceil(MI, MRI, B);
1145 case TargetOpcode::G_INTRINSIC_TRUNC:
1146 return legalizeIntrinsicTrunc(MI, MRI, B);
1147 case TargetOpcode::G_SITOFP:
1148 return legalizeITOFP(MI, MRI, B, true);
1149 case TargetOpcode::G_UITOFP:
1150 return legalizeITOFP(MI, MRI, B, false);
1151 case TargetOpcode::G_FMINNUM:
1152 case TargetOpcode::G_FMAXNUM:
1153 case TargetOpcode::G_FMINNUM_IEEE:
1154 case TargetOpcode::G_FMAXNUM_IEEE:
1155 return legalizeMinNumMaxNum(MI, MRI, B);
1156 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1157 return legalizeExtractVectorElt(MI, MRI, B);
1158 case TargetOpcode::G_INSERT_VECTOR_ELT:
1159 return legalizeInsertVectorElt(MI, MRI, B);
1160 case TargetOpcode::G_FSIN:
1161 case TargetOpcode::G_FCOS:
1162 return legalizeSinCos(MI, MRI, B);
1163 case TargetOpcode::G_GLOBAL_VALUE:
1164 return legalizeGlobalValue(MI, MRI, B);
1165 case TargetOpcode::G_LOAD:
1166 return legalizeLoad(MI, MRI, B, Observer);
1167 case TargetOpcode::G_FMAD:
1168 return legalizeFMad(MI, MRI, B);
1169 case TargetOpcode::G_FDIV:
1170 return legalizeFDIV(MI, MRI, B);
1171 case TargetOpcode::G_ATOMIC_CMPXCHG:
1172 return legalizeAtomicCmpXChg(MI, MRI, B);
1177 llvm_unreachable("expected switch to return");
1180 Register AMDGPULegalizerInfo::getSegmentAperture(
1182 MachineRegisterInfo &MRI,
1183 MachineIRBuilder &B) const {
1184 MachineFunction &MF = B.getMF();
1185 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186 const LLT S32 = LLT::scalar(32);
1188 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
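// Two ways to get the aperture: subtargets with aperture registers expose it
// through the hardware register read below (s_getreg); otherwise it is loaded
// from the implicit kernel queue pointer (amd_queue_t).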
1190 if (ST.hasApertureRegs()) {
1191 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1192 // getreg.
1193 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1194 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1195 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1196 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1197 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1198 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1199 unsigned Encoding =
1200 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1201 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1202 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1204 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1205 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1207 B.buildInstr(AMDGPU::S_GETREG_B32)
1208 .addDef(GetReg)
1209 .addImm(Encoding);
1210 MRI.setType(GetReg, S32);
1212 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1213 B.buildInstr(TargetOpcode::G_SHL)
1214 .addDef(ApertureReg)
1215 .addUse(GetReg)
1216 .addUse(ShiftAmt.getReg(0));
1221 Register QueuePtr = MRI.createGenericVirtualRegister(
1222 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1224 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1225 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1226 return Register();
1228 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1229 // private_segment_aperture_base_hi.
1230 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1232 // TODO: can we be smarter about machine pointer info?
1233 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1234 MachineMemOperand *MMO = MF.getMachineMemOperand(
1236 MachineMemOperand::MOLoad |
1237 MachineMemOperand::MODereferenceable |
1238 MachineMemOperand::MOInvariant,
1240 MinAlign(64, StructOffset));
1242 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1245 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1246 B.buildLoad(LoadResult, LoadAddr, *MMO);
1250 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1251 MachineInstr &MI, MachineRegisterInfo &MRI,
1252 MachineIRBuilder &B) const {
1253 MachineFunction &MF = B.getMF();
1257 const LLT S32 = LLT::scalar(32);
1258 Register Dst = MI.getOperand(0).getReg();
1259 Register Src = MI.getOperand(1).getReg();
1261 LLT DstTy = MRI.getType(Dst);
1262 LLT SrcTy = MRI.getType(Src);
1263 unsigned DestAS = DstTy.getAddressSpace();
1264 unsigned SrcAS = SrcTy.getAddressSpace();
1266 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1268 assert(!DstTy.isVector());
1270 const AMDGPUTargetMachine &TM
1271 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1273 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1274 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1275 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1279 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1281 B.buildExtract(Dst, Src, 0);
1282 MI.eraseFromParent();
1286 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1288 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1290 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1291 // another. Merge operands are required to be the same type, but creating an
1292 // extra ptrtoint would be kind of pointless.
1293 auto HighAddr = B.buildConstant(
1294 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1295 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1296 MI.eraseFromParent();
1300 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1301 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1302 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1303 unsigned NullVal = TM.getNullPointerValue(DestAS);
1305 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1306 auto FlatNull = B.buildConstant(SrcTy, 0);
1308 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1310 // Extract low 32-bits of the pointer.
1311 B.buildExtract(PtrLo32, Src, 0);
1313 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1314 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1315 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1317 MI.eraseFromParent();
1321 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1322 return false;
1324 if (!ST.hasFlatAddressSpace())
1325 return false;
1327 auto SegmentNull =
1328 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1329 auto FlatNull =
1330 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1332 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1333 if (!ApertureReg.isValid())
1334 return false;
1336 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1337 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1339 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1341 // Coerce the type of the low half of the result so we can use merge_values.
1342 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1343 B.buildInstr(TargetOpcode::G_PTRTOINT)
1344 .addDef(SrcAsInt)
1345 .addUse(Src);
1347 // TODO: Should we allow mismatched types but matching sizes in merges to
1348 // avoid the ptrtoint?
1349 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1350 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1352 MI.eraseFromParent();
1356 bool AMDGPULegalizerInfo::legalizeFrint(
1357 MachineInstr &MI, MachineRegisterInfo &MRI,
1358 MachineIRBuilder &B) const {
1361 Register Src = MI.getOperand(1).getReg();
1362 LLT Ty = MRI.getType(Src);
1363 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
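// Round-to-nearest-even via the classic 2^52 trick: adding and then
// subtracting copysign(2^52, src) flushes the fraction bits out of a double.
// Values with a magnitude above C2 (just under 2^52) are already integers,
// so the original source is selected for those.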
1365 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1366 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1368 auto C1 = B.buildFConstant(Ty, C1Val);
1369 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1371 // TODO: Should this propagate fast-math-flags?
1372 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1373 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1375 auto C2 = B.buildFConstant(Ty, C2Val);
1376 auto Fabs = B.buildFAbs(Ty, Src);
1378 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1379 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1383 bool AMDGPULegalizerInfo::legalizeFceil(
1384 MachineInstr &MI, MachineRegisterInfo &MRI,
1385 MachineIRBuilder &B) const {
1388 const LLT S1 = LLT::scalar(1);
1389 const LLT S64 = LLT::scalar(64);
1391 Register Src = MI.getOperand(1).getReg();
1392 assert(MRI.getType(Src) == S64);
1394 // result = trunc(src)
1395 // if (src > 0.0 && src != result)
1396 //   result += 1.0
1398 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1400 const auto Zero = B.buildFConstant(S64, 0.0);
1401 const auto One = B.buildFConstant(S64, 1.0);
1402 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1403 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1404 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1405 auto Add = B.buildSelect(S64, And, One, Zero);
1407 // TODO: Should this propagate fast-math-flags?
1408 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1412 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1413 MachineIRBuilder &B) {
1414 const unsigned FractBits = 52;
1415 const unsigned ExpBits = 11;
1416 LLT S32 = LLT::scalar(32);
1418 auto Const0 = B.buildConstant(S32, FractBits - 32);
1419 auto Const1 = B.buildConstant(S32, ExpBits);
1421 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1422 .addUse(Const0.getReg(0))
1423 .addUse(Const1.getReg(0));
1425 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1428 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1429 MachineInstr &MI, MachineRegisterInfo &MRI,
1430 MachineIRBuilder &B) const {
1433 const LLT S1 = LLT::scalar(1);
1434 const LLT S32 = LLT::scalar(32);
1435 const LLT S64 = LLT::scalar(64);
1437 Register Src = MI.getOperand(1).getReg();
1438 assert(MRI.getType(Src) == S64);
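// Software trunc for f64: read the biased exponent from the high word, then
// mask off the fraction bits that sit below the binary point. Exponents < 0
// mean |src| < 1, so only the sign bit survives; exponents > 51 mean the
// value is already an integer and the source is returned unchanged.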
1440 // TODO: Should this use extract since the low half is unused?
1441 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1442 Register Hi = Unmerge.getReg(1);
1444 // Extract the upper half, since this is where we will find the sign and
1445 // exponent.
1446 auto Exp = extractF64Exponent(Hi, B);
1448 const unsigned FractBits = 52;
1450 // Extract the sign bit.
1451 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1452 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1454 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1456 const auto Zero32 = B.buildConstant(S32, 0);
1458 // Extend back to 64-bits.
1459 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1461 auto Shr = B.buildAShr(S64, FractMask, Exp);
1462 auto Not = B.buildNot(S64, Shr);
1463 auto Tmp0 = B.buildAnd(S64, Src, Not);
1464 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1466 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1467 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1469 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1470 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1474 bool AMDGPULegalizerInfo::legalizeITOFP(
1475 MachineInstr &MI, MachineRegisterInfo &MRI,
1476 MachineIRBuilder &B, bool Signed) const {
1479 Register Dst = MI.getOperand(0).getReg();
1480 Register Src = MI.getOperand(1).getReg();
1482 const LLT S64 = LLT::scalar(64);
1483 const LLT S32 = LLT::scalar(32);
1485 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
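// Convert a 64-bit integer by converting the two 32-bit halves separately:
// the high half is converted (signed or unsigned) and scaled by 2^32 with
// ldexp, then the unsigned low half is added in.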
1487 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1489 auto CvtHi = Signed ?
1490 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1491 B.buildUITOFP(S64, Unmerge.getReg(1));
1493 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1495 auto ThirtyTwo = B.buildConstant(S32, 32);
1496 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1497 .addUse(CvtHi.getReg(0))
1498 .addUse(ThirtyTwo.getReg(0));
1500 // TODO: Should this propagate fast-math-flags?
1501 B.buildFAdd(Dst, LdExp, CvtLo);
1502 MI.eraseFromParent();
1506 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1507 MachineInstr &MI, MachineRegisterInfo &MRI,
1508 MachineIRBuilder &B) const {
1509 MachineFunction &MF = B.getMF();
1510 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1512 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1513 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1515 // With ieee_mode disabled, the instructions have the correct behavior
1516 // already for G_FMINNUM/G_FMAXNUM
1517 if (!MFI->getMode().IEEE)
1523 MachineIRBuilder HelperBuilder(MI);
1524 GISelObserverWrapper DummyObserver;
1525 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1526 HelperBuilder.setInstr(MI);
1527 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1530 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1531 MachineInstr &MI, MachineRegisterInfo &MRI,
1532 MachineIRBuilder &B) const {
1533 // TODO: Should move some of this into LegalizerHelper.
1535 // TODO: Promote dynamic indexing of s16 to s32
1536 // TODO: Dynamic s64 indexing is only legal for SGPR.
1537 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1538 if (!IdxVal) // Dynamic case will be selected to register indexing.
1539 return true;
1541 Register Dst = MI.getOperand(0).getReg();
1542 Register Vec = MI.getOperand(1).getReg();
1544 LLT VecTy = MRI.getType(Vec);
1545 LLT EltTy = VecTy.getElementType();
1546 assert(EltTy == MRI.getType(Dst));
1550 if (IdxVal.getValue() < VecTy.getNumElements())
1551 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1552 else
1553 B.buildUndef(Dst);
1555 MI.eraseFromParent();
1559 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1560 MachineInstr &MI, MachineRegisterInfo &MRI,
1561 MachineIRBuilder &B) const {
1562 // TODO: Should move some of this into LegalizerHelper.
1564 // TODO: Promote dynamic indexing of s16 to s32
1565 // TODO: Dynamic s64 indexing is only legal for SGPR.
1566 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1567 if (!IdxVal) // Dynamic case will be selected to register indexing.
1568 return true;
1570 Register Dst = MI.getOperand(0).getReg();
1571 Register Vec = MI.getOperand(1).getReg();
1572 Register Ins = MI.getOperand(2).getReg();
1574 LLT VecTy = MRI.getType(Vec);
1575 LLT EltTy = VecTy.getElementType();
1576 assert(EltTy == MRI.getType(Ins));
1580 if (IdxVal.getValue() < VecTy.getNumElements())
1581 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1585 MI.eraseFromParent();
1589 bool AMDGPULegalizerInfo::legalizeSinCos(
1590 MachineInstr &MI, MachineRegisterInfo &MRI,
1591 MachineIRBuilder &B) const {
1594 Register DstReg = MI.getOperand(0).getReg();
1595 Register SrcReg = MI.getOperand(1).getReg();
1596 LLT Ty = MRI.getType(DstReg);
1597 unsigned Flags = MI.getFlags();
1599 Register TrigVal;
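// The hardware sin/cos intrinsics take an input that has already been divided
// by 2*pi (i.e. in revolutions), so the source is pre-multiplied by 0.5/PI;
// subtargets with a reduced trig range additionally need the input wrapped
// into [0, 1) with amdgcn.fract first.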
1600 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1601 if (ST.hasTrigReducedRange()) {
1602 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1603 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1604 .addUse(MulVal.getReg(0))
1605 .setMIFlags(Flags).getReg(0);
1607 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1609 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1610 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1611 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1614 MI.eraseFromParent();
1618 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1619 Register DstReg, LLT PtrTy,
1620 MachineIRBuilder &B, const GlobalValue *GV,
1621 unsigned Offset, unsigned GAFlags) const {
1622 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1623 // to the following code sequence:
1625 // For constant address space:
1626 // s_getpc_b64 s[0:1]
1627 // s_add_u32 s0, s0, $symbol
1628 // s_addc_u32 s1, s1, 0
1630 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1631 // a fixup or relocation is emitted to replace $symbol with a literal
1632 // constant, which is a pc-relative offset from the encoding of the $symbol
1633 // operand to the global variable.
1635 // For global address space:
1636 // s_getpc_b64 s[0:1]
1637 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1638 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1640 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1641 // fixups or relocations are emitted to replace $symbol@*@lo and
1642 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1643 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1644 // operand to the global variable.
1646 // What we want here is an offset from the value returned by s_getpc
1647 // (which is the address of the s_add_u32 instruction) to the global
1648 // variable, but since the encoding of $symbol starts 4 bytes after the start
1649 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1650 // small. This requires us to add 4 to the global variable offset in order to
1651 // compute the correct address.
1653 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1655 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1656 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1658 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1661 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1662 if (GAFlags == SIInstrInfo::MO_NONE)
1663 MIB.addImm(0);
1664 else
1665 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1667 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1669 if (PtrTy.getSizeInBits() == 32)
1670 B.buildExtract(DstReg, PCReg, 0);
1674 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1675 MachineInstr &MI, MachineRegisterInfo &MRI,
1676 MachineIRBuilder &B) const {
1677 Register DstReg = MI.getOperand(0).getReg();
1678 LLT Ty = MRI.getType(DstReg);
1679 unsigned AS = Ty.getAddressSpace();
1681 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1682 MachineFunction &MF = B.getMF();
1683 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1686 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1687 if (!MFI->isEntryFunction()) {
1688 const Function &Fn = MF.getFunction();
1689 DiagnosticInfoUnsupported BadLDSDecl(
1690 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1691 Fn.getContext().diagnose(BadLDSDecl);
1694 // TODO: We could emit code to handle the initialization somewhere.
1695 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1696 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697 MI.eraseFromParent();
1701 const Function &Fn = MF.getFunction();
1702 DiagnosticInfoUnsupported BadInit(
1703 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704 Fn.getContext().diagnose(BadInit);
1708 const SITargetLowering *TLI = ST.getTargetLowering();
1710 if (TLI->shouldEmitFixup(GV)) {
1711 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712 MI.eraseFromParent();
1716 if (TLI->shouldEmitPCReloc(GV)) {
1717 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718 MI.eraseFromParent();
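// Everything else goes through the GOT: materialize a 64-bit constant-address
// pointer to the GOT entry with a GOTPCREL32 relocation and load the actual
// address of the global from it.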
1722 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1725 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726 MachinePointerInfo::getGOT(MF),
1727 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728 MachineMemOperand::MOInvariant,
1729 8 /*Size*/, 8 /*Align*/);
1731 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1733 if (Ty.getSizeInBits() == 32) {
1734 // Truncate if this is a 32-bit constant address.
1735 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736 B.buildExtract(DstReg, Load, 0);
1738 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1740 MI.eraseFromParent();
1744 bool AMDGPULegalizerInfo::legalizeLoad(
1745 MachineInstr &MI, MachineRegisterInfo &MRI,
1746 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1748 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750 Observer.changingInstr(MI);
1751 MI.getOperand(1).setReg(Cast.getReg(0));
1752 Observer.changedInstr(MI);
1756 bool AMDGPULegalizerInfo::legalizeFMad(
1757 MachineInstr &MI, MachineRegisterInfo &MRI,
1758 MachineIRBuilder &B) const {
1759 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760 assert(Ty.isScalar());
1762 MachineFunction &MF = B.getMF();
1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1765 // TODO: Always legal with future ftz flag.
1766 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767 return true;
1768 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1769 return true;
1772 MachineIRBuilder HelperBuilder(MI);
1773 GISelObserverWrapper DummyObserver;
1774 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775 HelperBuilder.setMBB(*MI.getParent());
1776 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1779 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781 Register DstReg = MI.getOperand(0).getReg();
1782 Register PtrReg = MI.getOperand(1).getReg();
1783 Register CmpVal = MI.getOperand(2).getReg();
1784 Register NewVal = MI.getOperand(3).getReg();
1786 assert(SITargetLowering::isFlatGlobalAddrSpace(
1787 MRI.getType(PtrReg).getAddressSpace()) &&
1788 "this should not have been custom lowered");
1790 LLT ValTy = MRI.getType(CmpVal);
1791 LLT VecTy = LLT::vector(2, ValTy);
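// The target cmpxchg pseudo takes the new value and the compare value packed
// together into a single vector data operand, matching the operand layout the
// hardware BUFFER/FLAT_ATOMIC_CMP_SWAP instructions expect.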
1794 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1796 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797 .addDef(DstReg)
1798 .addUse(PtrReg)
1799 .addUse(PackedVal)
1800 .setMemRefs(MI.memoperands());
1802 MI.eraseFromParent();
1806 // Return the use branch instruction, otherwise null if the usage is invalid.
1807 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808 MachineRegisterInfo &MRI,
1809 MachineInstr *&Br) {
1810 Register CondDef = MI.getOperand(0).getReg();
1811 if (!MRI.hasOneNonDBGUse(CondDef))
1812 return nullptr;
1814 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1815 if (UseMI.getParent() != MI.getParent() ||
1816 UseMI.getOpcode() != AMDGPU::G_BRCOND)
1817 return nullptr;
1819 // Make sure the cond br is followed by a G_BR
1820 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1821 if (Next != MI.getParent()->end()) {
1822 if (Next->getOpcode() != AMDGPU::G_BR)
1830 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1831 Register Reg, LLT Ty) const {
1832 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1836 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1837 MRI.addLiveIn(Reg, NewReg);
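// Copy the preloaded argument described by Arg into DstReg. If the value is
// packed into a bitfield of the register (Arg->isMasked()), shift and mask
// out the relevant bits first.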
1841 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1842 const ArgDescriptor *Arg) const {
1843 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1844 return false; // TODO: Handle these
1846 assert(Arg->getRegister().isPhysical());
1848 MachineRegisterInfo &MRI = *B.getMRI();
1850 LLT Ty = MRI.getType(DstReg);
1851 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1853 if (Arg->isMasked()) {
1854 // TODO: Should we try to emit this once in the entry block?
1855 const LLT S32 = LLT::scalar(32);
1856 const unsigned Mask = Arg->getMask();
1857 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1859 Register AndMaskSrc = LiveIn;
1862 auto ShiftAmt = B.buildConstant(S32, Shift);
1863 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1866 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1868 B.buildCopy(DstReg, LiveIn);
1870 // Insert the argument copy if it doesn't already exist.
1871 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1872 if (!MRI.getVRegDef(LiveIn)) {
1873 // FIXME: Should have scoped insert pt
1874 MachineBasicBlock &OrigInsBB = B.getMBB();
1875 auto OrigInsPt = B.getInsertPt();
1877 MachineBasicBlock &EntryMBB = B.getMF().front();
1878 EntryMBB.addLiveIn(Arg->getRegister());
1879 B.setInsertPt(EntryMBB, EntryMBB.begin());
1880 B.buildCopy(LiveIn, Arg->getRegister());
1882 B.setInsertPt(OrigInsBB, OrigInsPt);
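// Replace an intrinsic that reads a preloaded argument (workitem/workgroup
// IDs, dispatch pointer, queue pointer, ...) with a copy from the
// corresponding input register, failing if that register was not allocated.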
1888 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1890 MachineRegisterInfo &MRI,
1891 MachineIRBuilder &B,
1892 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1895 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1897 const ArgDescriptor *Arg;
1898 const TargetRegisterClass *RC;
1899 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1901 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1905 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1906 MI.eraseFromParent();
1913 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1914 MachineRegisterInfo &MRI,
1915 MachineIRBuilder &B) const {
1917 Register Dst = MI.getOperand(0).getReg();
1918 LLT DstTy = MRI.getType(Dst);
1919 LLT S16 = LLT::scalar(16);
1920 LLT S32 = LLT::scalar(32);
1921 LLT S64 = LLT::scalar(64);
1923 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1927 return legalizeFDIV16(MI, MRI, B);
1929 return legalizeFDIV32(MI, MRI, B);
1931 return legalizeFDIV64(MI, MRI, B);
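// Cheap expansions for fdiv when precision loss is acceptable: 1.0/x and
// -1.0/x fold to a single rcp, and under unsafe math (or the arcp flag)
// x/y becomes x * rcp(y). Returns false when the full-precision path must be
// used instead.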
1936 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1937 MachineRegisterInfo &MRI,
1938 MachineIRBuilder &B) const {
1939 Register Res = MI.getOperand(0).getReg();
1940 Register LHS = MI.getOperand(1).getReg();
1941 Register RHS = MI.getOperand(2).getReg();
1943 uint16_t Flags = MI.getFlags();
1945 LLT ResTy = MRI.getType(Res);
1946 LLT S32 = LLT::scalar(32);
1947 LLT S64 = LLT::scalar(64);
1949 const MachineFunction &MF = B.getMF();
1951 bool Unsafe = MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1953 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1956 if (!Unsafe && ResTy == S32 &&
1957 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1960 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1962 if (CLHS->isExactlyValue(1.0)) {
1963 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1967 MI.eraseFromParent();
1971 // -1 / x -> RCP( FNEG(x) )
1972 if (CLHS->isExactlyValue(-1.0)) {
1973 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1974 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1975 .addUse(FNeg.getReg(0))
1978 MI.eraseFromParent();
1983 // x / y -> x * (1.0 / y)
1985 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1988 B.buildFMul(Res, LHS, RCP, Flags);
1990 MI.eraseFromParent();
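// f16 fdiv: extend both operands to f32, multiply the numerator by rcp of the
// denominator, truncate the quotient back to f16, and run div_fixup to handle
// the special cases (infinities, NaNs, zero denominators).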
1997 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1998 MachineRegisterInfo &MRI,
1999 MachineIRBuilder &B) const {
2001 Register Res = MI.getOperand(0).getReg();
2002 Register LHS = MI.getOperand(1).getReg();
2003 Register RHS = MI.getOperand(2).getReg();
2005 uint16_t Flags = MI.getFlags();
2007 LLT S16 = LLT::scalar(16);
2008 LLT S32 = LLT::scalar(32);
2010 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2011 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2013 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2014 .addUse(RHSExt.getReg(0))
2017 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2018 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2020 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2021 .addUse(RDst.getReg(0))
2026 MI.eraseFromParent();
2030 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2031 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2032 static void toggleSPDenormMode(bool Enable,
2033 MachineIRBuilder &B,
2034 const GCNSubtarget &ST,
2035 AMDGPU::SIModeRegisterDefaults Mode) {
2036 // Set SP denorm mode to this value.
2037 unsigned SPDenormMode =
2038 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2040 if (ST.hasDenormModeInst()) {
2041 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2042 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2043 ? FP_DENORM_FLUSH_NONE
2044 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
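// s_denorm_mode packs the FP32 control in bits [1:0] and the FP64/FP16
// control in bits [3:2] of its immediate.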
2046 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2047 B.buildInstr(AMDGPU::S_DENORM_MODE)
2048 .addImm(NewDenormModeValue);
2051 // Select FP32 bit field in mode register.
2052 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2053 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2054 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
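// This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denormal control field is
// 2 bits wide (WIDTH_M1 = 1) starting at bit 4 of the MODE register.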
2056 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2057 .addImm(SPDenormMode)
2058 .addImm(SPDenormModeBitField);
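// Full-precision f32 fdiv: scale the operands with div_scale, form an initial
// rcp estimate, refine it with an FMA-based Newton-Raphson sequence, combine
// the result with div_fmas (which consumes the scale condition), and finish
// with div_fixup. Denormals are temporarily enabled around the FMA sequence
// when the function's FP32 mode would otherwise flush them.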
2062 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2063 MachineRegisterInfo &MRI,
2064 MachineIRBuilder &B) const {
2066 Register Res = MI.getOperand(0).getReg();
2067 Register LHS = MI.getOperand(1).getReg();
2068 Register RHS = MI.getOperand(2).getReg();
2069 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2070 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2072 uint16_t Flags = MI.getFlags();
2074 LLT S32 = LLT::scalar(32);
2075 LLT S1 = LLT::scalar(1);
2077 auto One = B.buildFConstant(S32, 1.0f);
2079 auto DenominatorScaled =
2080 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2085 auto NumeratorScaled =
2086 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2092 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093 .addUse(DenominatorScaled.getReg(0))
2095 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2097 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2098 // aren't modeled as reading it.
2099 if (!Mode.FP32Denormals)
2100 toggleSPDenormMode(true, B, ST, Mode);
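// One Newton-Raphson step refines the reciprocal (Fma0, Fma1); the quotient
// estimate (Mul) is corrected once from its residual (Fma2, Fma3), and the
// final residual (Fma4) is folded in by div_fmas below.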
2102 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2103 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2104 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2105 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2106 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2107 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2109 if (!Mode.FP32Denormals)
2110 toggleSPDenormMode(false, B, ST, Mode);
2112 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2113 .addUse(Fma4.getReg(0))
2114 .addUse(Fma1.getReg(0))
2115 .addUse(Fma3.getReg(0))
2116 .addUse(NumeratorScaled.getReg(1))
2119 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2120 .addUse(Fmas.getReg(0))
2125 MI.eraseFromParent();
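// f64 fdiv follows the same pattern as the f32 path: div_scale, an rcp
// estimate refined with FMAs, div_fmas, and div_fixup, with extra refinement
// steps for double precision. On subtargets without a usable div_scale
// condition output, the condition is recomputed manually below.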
2129 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2130 MachineRegisterInfo &MRI,
2131 MachineIRBuilder &B) const {
2133 Register Res = MI.getOperand(0).getReg();
2134 Register LHS = MI.getOperand(1).getReg();
2135 Register RHS = MI.getOperand(2).getReg();
2137 uint16_t Flags = MI.getFlags();
2139 LLT S64 = LLT::scalar(64);
2140 LLT S1 = LLT::scalar(1);
2142 auto One = B.buildFConstant(S64, 1.0);
2144 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2150 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2152 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2153 .addUse(DivScale0.getReg(0))
2156 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2157 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2158 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2160 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2166 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2167 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2168 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2171 if (!ST.hasUsableDivScaleConditionOutput()) {
2172 // Work around a hardware bug on SI where the condition output from div_scale is not usable.
2175 Scale = MRI.createGenericVirtualRegister(S1);
2177 LLT S32 = LLT::scalar(32);
2179 auto NumUnmerge = B.buildUnmerge(S32, LHS);
2180 auto DenUnmerge = B.buildUnmerge(S32, RHS);
2181 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2182 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
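// Recover the div_scale condition bit manually: compare the high halves of
// the original operands against the div_scale results to see which operand
// was scaled, and XOR the two tests.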
2184 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2185 Scale1Unmerge.getReg(1));
2186 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2187 Scale0Unmerge.getReg(1));
2188 B.buildXor(Scale, CmpNum, CmpDen);
2190 Scale = DivScale1.getReg(1);
2193 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2194 .addUse(Fma4.getReg(0))
2195 .addUse(Fma3.getReg(0))
2196 .addUse(Mul.getReg(0))
2200 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2201 .addUse(Fmas.getReg(0))
2206 MI.eraseFromParent();
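// Expansion of amdgcn.fdiv.fast: a faster, reduced-precision division. Large
// denominators are pre-scaled so the rcp stays in range, and the result is
// rescaled by the same factor so the quotient is unchanged.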
2210 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2211 MachineRegisterInfo &MRI,
2212 MachineIRBuilder &B) const {
2214 Register Res = MI.getOperand(0).getReg();
2215 Register LHS = MI.getOperand(2).getReg();
2216 Register RHS = MI.getOperand(3).getReg();
2217 uint16_t Flags = MI.getFlags();
2219 LLT S32 = LLT::scalar(32);
2220 LLT S1 = LLT::scalar(1);
2222 auto Abs = B.buildFAbs(S32, RHS, Flags);
2223 const APFloat C0Val(1.0f);
2225 auto C0 = B.buildConstant(S32, 0x6f800000);
2226 auto C1 = B.buildConstant(S32, 0x2f800000);
2227 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
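// 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32 in IEEE-754 single precision.
// When |RHS| exceeds 2^96 the denominator is multiplied by 2^-32 before the
// rcp, and the same factor (Sel) is multiplied back into the final result,
// leaving the quotient unchanged.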
2229 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2230 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2232 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2234 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2235 .addUse(Mul0.getReg(0))
2238 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2240 B.buildFMul(Res, Sel, Mul1, Flags);
2242 MI.eraseFromParent();
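// In a non-entry (callable) function the implicit argument pointer is simply
// a preloaded register; in a kernel it is the kernarg segment pointer plus
// the offset of the first implicit argument.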
2246 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2247 MachineRegisterInfo &MRI,
2248 MachineIRBuilder &B) const {
2249 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2250 if (!MFI->isEntryFunction()) {
2251 return legalizePreloadedArgIntrin(MI, MRI, B,
2252 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2258 ST.getTargetLowering()->getImplicitParameterOffset(
2259 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2260 Register DstReg = MI.getOperand(0).getReg();
2261 LLT DstTy = MRI.getType(DstReg);
2262 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2264 const ArgDescriptor *Arg;
2265 const TargetRegisterClass *RC;
2267 std::tie(Arg, RC) = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2271 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2272 if (!loadInputValue(KernargPtrReg, B, Arg))
2275 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2276 MI.eraseFromParent();
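// amdgcn.is.shared / amdgcn.is.private: a flat pointer points into the given
// segment iff its high 32 bits match that segment's aperture base, so compare
// the extracted high half with the aperture register.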
2280 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2281 MachineRegisterInfo &MRI,
2282 MachineIRBuilder &B,
2283 unsigned AddrSpace) const {
2285 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2286 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2287 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2288 MI.eraseFromParent();
2292 /// Handle register layout difference for f16 images for some subtargets.
2293 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2294 MachineRegisterInfo &MRI,
2295 Register Reg) const {
2296 if (!ST.hasUnpackedD16VMem())
2299 const LLT S16 = LLT::scalar(16);
2300 const LLT S32 = LLT::scalar(32);
2301 LLT StoreVT = MRI.getType(Reg);
2302 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
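// With unpacked D16 memory instructions every 16-bit component occupies the
// low half of its own 32-bit register, so split the vector, any-extend each
// element to 32 bits, and rebuild it as an <N x s32> vector.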
2304 auto Unmerge = B.buildUnmerge(S16, Reg);
2306 SmallVector<Register, 4> WideRegs;
2307 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2308 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2310 int NumElts = StoreVT.getNumElements();
2312 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2315 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2316 MachineRegisterInfo &MRI,
2317 MachineIRBuilder &B,
2318 bool IsFormat) const {
2319 // TODO: Reject f16 format on targets where it is unsupported.
2320 Register VData = MI.getOperand(1).getReg();
2321 LLT Ty = MRI.getType(VData);
2325 const LLT S32 = LLT::scalar(32);
2326 const LLT S16 = LLT::scalar(16);
2328 // Fix up illegal register types for 8-bit and 16-bit stores.
2329 if (Ty == LLT::scalar(8) || Ty == S16) {
2330 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2331 MI.getOperand(1).setReg(AnyExt);
2335 if (Ty.isVector()) {
2336 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2338 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2342 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2348 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2349 MachineRegisterInfo &MRI,
2350 MachineIRBuilder &B) const {
2351 // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
2352 auto IntrID = MI.getIntrinsicID();
2354 case Intrinsic::amdgcn_if:
2355 case Intrinsic::amdgcn_else: {
2356 MachineInstr *Br = nullptr;
2357 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2358 const SIRegisterInfo *TRI
2359 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2361 B.setInstr(*BrCond);
2362 Register Def = MI.getOperand(1).getReg();
2363 Register Use = MI.getOperand(3).getReg();
2365 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2367 BrTarget = Br->getOperand(0).getMBB();
2369 if (IntrID == Intrinsic::amdgcn_if) {
2370 B.buildInstr(AMDGPU::SI_IF)
2375 B.buildInstr(AMDGPU::SI_ELSE)
2383 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2385 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2386 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2387 MI.eraseFromParent();
2388 BrCond->eraseFromParent();
2394 case Intrinsic::amdgcn_loop: {
2395 MachineInstr *Br = nullptr;
2396 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2397 const SIRegisterInfo *TRI
2398 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2400 B.setInstr(*BrCond);
2402 // FIXME: Need to adjust branch targets based on unconditional branch.
2403 Register Reg = MI.getOperand(2).getReg();
2404 B.buildInstr(AMDGPU::SI_LOOP)
2406 .addMBB(BrCond->getOperand(1).getMBB());
2407 MI.eraseFromParent();
2408 BrCond->eraseFromParent();
2409 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2415 case Intrinsic::amdgcn_kernarg_segment_ptr:
2416 return legalizePreloadedArgIntrin(
2417 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2418 case Intrinsic::amdgcn_implicitarg_ptr:
2419 return legalizeImplicitArgPtr(MI, MRI, B);
2420 case Intrinsic::amdgcn_workitem_id_x:
2421 return legalizePreloadedArgIntrin(MI, MRI, B,
2422 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2423 case Intrinsic::amdgcn_workitem_id_y:
2424 return legalizePreloadedArgIntrin(MI, MRI, B,
2425 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2426 case Intrinsic::amdgcn_workitem_id_z:
2427 return legalizePreloadedArgIntrin(MI, MRI, B,
2428 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2429 case Intrinsic::amdgcn_workgroup_id_x:
2430 return legalizePreloadedArgIntrin(MI, MRI, B,
2431 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2432 case Intrinsic::amdgcn_workgroup_id_y:
2433 return legalizePreloadedArgIntrin(MI, MRI, B,
2434 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2435 case Intrinsic::amdgcn_workgroup_id_z:
2436 return legalizePreloadedArgIntrin(MI, MRI, B,
2437 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2438 case Intrinsic::amdgcn_dispatch_ptr:
2439 return legalizePreloadedArgIntrin(MI, MRI, B,
2440 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2441 case Intrinsic::amdgcn_queue_ptr:
2442 return legalizePreloadedArgIntrin(MI, MRI, B,
2443 AMDGPUFunctionArgInfo::QUEUE_PTR);
2444 case Intrinsic::amdgcn_implicit_buffer_ptr:
2445 return legalizePreloadedArgIntrin(
2446 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2447 case Intrinsic::amdgcn_dispatch_id:
2448 return legalizePreloadedArgIntrin(MI, MRI, B,
2449 AMDGPUFunctionArgInfo::DISPATCH_ID);
2450 case Intrinsic::amdgcn_fdiv_fast:
2451 return legalizeFDIVFastIntrin(MI, MRI, B);
2452 case Intrinsic::amdgcn_is_shared:
2453 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2454 case Intrinsic::amdgcn_is_private:
2455 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2456 case Intrinsic::amdgcn_wavefrontsize: {
2458 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2459 MI.eraseFromParent();
2462 case Intrinsic::amdgcn_raw_buffer_store:
2463 return legalizeRawBufferStore(MI, MRI, B, false);
2464 case Intrinsic::amdgcn_raw_buffer_store_format:
2465 return legalizeRawBufferStore(MI, MRI, B, true);