//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}
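
// Used together with isSmallOddVector: an odd-element vector of sub-32-bit
// elements (e.g. v3s16) is padded with one extra element to v4s16, presumably
// so it fills a whole number of 32-bit registers.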
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
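
// Split a wide vector so each piece is at most 64 bits: e.g. v3s32 (96 bits)
// needs two pieces, so the element count is halved (rounding up) to give
// v2s32.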
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
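  // The chain below first narrows oversized results, so a wide register type
  // with a narrower memory size becomes a 32-bit extending access; it then
  // splits 96-bit vector accesses in half when the target lacks dwordx3
  // load/store, and finally enumerates the memory sizes the hardware can
  // handle directly.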
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
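          // (e.g. for s300 the next power of 2 is s512, but the next multiple
          // of 64 is s320, so s320 is chosen.)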
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
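
    // The aperture's significant bits are read out of the MEM_BASES hardware
    // register with s_getreg; shifting them left by the field width
    // (WidthM1 + 1) appears to reconstruct the high 32 bits of the 64-bit
    // aperture address.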
    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
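
// Casting between flat and a segment address space (local/private) splits or
// rebuilds the 64-bit flat pointer: the low 32 bits hold the segment offset
// and the high 32 bits come from the segment aperture, with null values
// mapped to each space's sentinel null via compare-and-select.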
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // function.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
    MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
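
// Round-to-nearest via the 2^52 trick: adding and then subtracting
// copysign(2^52, x) leaves the nearest integer, since f64 values of that
// magnitude have no fraction bits (this presumably relies on the default
// round-to-nearest-even mode). Inputs whose magnitude already exceeds
// 0x1.fffffffffffffp+51 are integral and returned unchanged.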
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
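
// Pull the 11 exponent bits out of the high word of an f64 (bits 52..62 of
// the full value, i.e. bits 20..30 of the high half) and remove the IEEE-754
// bias of 1023.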
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
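
// Implements trunc for f64 by clearing the fraction bits below the exponent:
// build a mask of the bits that must survive, AND it with the source, and
// select +/-0 when the exponent is negative or the unmodified source when the
// exponent is larger than 51 (already integral).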
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
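
// Convert a 64-bit integer to f64 in halves: convert the high 32 bits
// (signed or unsigned as appropriate), scale by 2^32 with amdgcn_ldexp,
// convert the low 32 bits unsigned, and add the two results.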
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
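
// The hardware min/max follow the function's ieee_mode: with it enabled they
// quiet signaling NaNs (the *_IEEE semantics), with it disabled they do not.
// So the opcode matching the current mode is already correct, and a non-IEEE
// op under ieee_mode is expanded through LegalizerHelper.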
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}
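
// Reuse the virtual register already recorded for this physical live-in if
// one exists; otherwise create a virtual register of the requested type and
// register it as the live-in value.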
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
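    // Presumably multiple small values share one input register; decode the
    // field by shifting it down to bit 0 and masking to its width.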
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);