contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

   1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 /// This file implements the targeting of the Machinelegalizer class for
  10 /// AMDGPU.
  11 /// \todo This should be generated by TableGen.
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPULegalizerInfo.h"
  15
  16 #include "AMDGPU.h"
  17 #include "AMDGPUGlobalISelUtils.h"
  18 #include "AMDGPUTargetMachine.h"
  19 #include "SIMachineFunctionInfo.h"
  20 #include "llvm/ADT/ScopeExit.h"
  21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
  22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
  24 #include "llvm/CodeGen/TargetOpcodes.h"
  25 #include "llvm/CodeGen/ValueTypes.h"
  26 #include "llvm/IR/DerivedTypes.h"
  27 #include "llvm/IR/DiagnosticInfo.h"
  28 #include "llvm/IR/Type.h"
  29 #include "llvm/Support/Debug.h"
  30
  31 #define DEBUG_TYPE "amdgpu-legalinfo"
  32
  33 using namespace llvm;
  34 using namespace LegalizeActions;
  35 using namespace LegalizeMutations;
  36 using namespace LegalityPredicates;
  37 using namespace MIPatternMatch;
  38
  39 // Hack until load/store selection patterns support any tuple of legal types.
  40 static cl::opt<bool> EnableNewLegality(
  41   "amdgpu-global-isel-new-legality",
  42   cl::desc("Use GlobalISel desired legality, rather than try to use"
  43            "rules compatible with selection patterns"),
  44   cl::init(false),
  45   cl::ReallyHidden);
  46
  47 static constexpr unsigned MaxRegisterSize = 1024;
  48
  49 // Round the number of elements to the next power of two elements
  50 static LLT getPow2VectorType(LLT Ty) {
  51   unsigned NElts = Ty.getNumElements();
  52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
  53   return Ty.changeNumElements(Pow2NElts);
  54 }
  55
  56 // Round the number of bits to the next power of two bits
  57 static LLT getPow2ScalarType(LLT Ty) {
  58   unsigned Bits = Ty.getSizeInBits();
  59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
  60   return LLT::scalar(Pow2Bits);
  61 }
  62
  63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  64   return [=](const LegalityQuery &Query) {
  65     const LLT Ty = Query.Types[TypeIdx];
  66     return Ty.isVector() &&
  67            Ty.getNumElements() % 2 != 0 &&
  68            Ty.getElementType().getSizeInBits() < 32 &&
  69            Ty.getSizeInBits() % 32 != 0;
  70   };
  71 }
  72
  73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  74   return [=](const LegalityQuery &Query) {
  75     const LLT Ty = Query.Types[TypeIdx];
  76     const LLT EltTy = Ty.getScalarType();
  77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  78   };
  79 }
  80
  81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  82   return [=](const LegalityQuery &Query) {
  83     const LLT Ty = Query.Types[TypeIdx];
  84     const LLT EltTy = Ty.getElementType();
  85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  86   };
  87 }
  88
  89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  90   return [=](const LegalityQuery &Query) {
  91     const LLT Ty = Query.Types[TypeIdx];
  92     const LLT EltTy = Ty.getElementType();
  93     unsigned Size = Ty.getSizeInBits();
  94     unsigned Pieces = (Size + 63) / 64;
  95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
  96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  97   };
  98 }
  99
 100 // Increase the number of vector elements to reach the next multiple of 32-bit
 101 // type.
 102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
 103   return [=](const LegalityQuery &Query) {
 104     const LLT Ty = Query.Types[TypeIdx];
 105
 106     const LLT EltTy = Ty.getElementType();
 107     const int Size = Ty.getSizeInBits();
 108     const int EltSize = EltTy.getSizeInBits();
 109     const int NextMul32 = (Size + 31) / 32;
 110
 111     assert(EltSize < 32);
 112
 113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
 114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
 115   };
 116 }
 117
 118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
 119   return [=](const LegalityQuery &Query) {
 120     const LLT Ty = Query.Types[TypeIdx];
 121     unsigned Size = Ty.getSizeInBits();
 122
 123     LLT CoercedTy;
 124     if (Size <= 32) {
 125       // <2 x s8> -> s16
 126       // <4 x s8> -> s32
 127       CoercedTy = LLT::scalar(Size);
 128     } else
 129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
 130
 131     return std::make_pair(TypeIdx, CoercedTy);
 132   };
 133 }
 134
 135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
 136   return [=](const LegalityQuery &Query) {
 137     const LLT QueryTy = Query.Types[TypeIdx];
 138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
 139   };
 140 }
 141
 142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
 143   return [=](const LegalityQuery &Query) {
 144     const LLT QueryTy = Query.Types[TypeIdx];
 145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
 146   };
 147 }
 148
 149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
 150   return [=](const LegalityQuery &Query) {
 151     const LLT QueryTy = Query.Types[TypeIdx];
 152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
 153   };
 154 }
 155
 156 static bool isRegisterSize(unsigned Size) {
 157   return Size % 32 == 0 && Size <= MaxRegisterSize;
 158 }
 159
 160 static bool isRegisterVectorElementType(LLT EltTy) {
 161   const int EltSize = EltTy.getSizeInBits();
 162   return EltSize == 16 || EltSize % 32 == 0;
 163 }
 164
 165 static bool isRegisterVectorType(LLT Ty) {
 166   const int EltSize = Ty.getElementType().getSizeInBits();
 167   return EltSize == 32 || EltSize == 64 ||
 168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
 169          EltSize == 128 || EltSize == 256;
 170 }
 171
 172 static bool isRegisterType(LLT Ty) {
 173   if (!isRegisterSize(Ty.getSizeInBits()))
 174     return false;
 175
 176   if (Ty.isVector())
 177     return isRegisterVectorType(Ty);
 178
 179   return true;
 180 }
 181
 182 // Any combination of 32 or 64-bit elements up the maximum register size, and
 183 // multiples of v2s16.
 184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
 185   return [=](const LegalityQuery &Query) {
 186     return isRegisterType(Query.Types[TypeIdx]);
 187   };
 188 }
 189
 190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
 191   return [=](const LegalityQuery &Query) {
 192     const LLT QueryTy = Query.Types[TypeIdx];
 193     if (!QueryTy.isVector())
 194       return false;
 195     const LLT EltTy = QueryTy.getElementType();
 196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
 197   };
 198 }
 199
 200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
 201   return [=](const LegalityQuery &Query) {
 202     const LLT Ty = Query.Types[TypeIdx];
 203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
 205   };
 206 }
 207
 208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 209 // handle some operations by just promoting the register during
 210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
 211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
 212                                     bool IsLoad) {
 213   switch (AS) {
 214   case AMDGPUAS::PRIVATE_ADDRESS:
 215     // FIXME: Private element size.
 216     return 32;
 217   case AMDGPUAS::LOCAL_ADDRESS:
 218     return ST.useDS128() ? 128 : 64;
 219   case AMDGPUAS::GLOBAL_ADDRESS:
 220   case AMDGPUAS::CONSTANT_ADDRESS:
 221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
 222     // Treat constant and global as identical. SMRD loads are sometimes usable for
 223     // global loads (ideally constant address space should be eliminated)
 224     // depending on the context. Legality cannot be context dependent, but
 225     // RegBankSelect can split the load as necessary depending on the pointer
 226     // register bank/uniformity and if the memory is invariant or not written in a
 227     // kernel.
 228     return IsLoad ? 512 : 128;
 229   default:
 230     // Flat addresses may contextually need to be split to 32-bit parts if they
 231     // may alias scratch depending on the subtarget.
 232     return 128;
 233   }
 234 }
 235
 236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
 237                                  const LegalityQuery &Query,
 238                                  unsigned Opcode) {
 239   const LLT Ty = Query.Types[0];
 240
 241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
 242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
 243
 244   unsigned RegSize = Ty.getSizeInBits();
 245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 246   unsigned Align = Query.MMODescrs[0].AlignInBits;
 247   unsigned AS = Query.Types[1].getAddressSpace();
 248
 249   // All of these need to be custom lowered to cast the pointer operand.
 250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 251     return false;
 252
 253   // TODO: We should be able to widen loads if the alignment is high enough, but
 254   // we also need to modify the memory access size.
 255 #if 0
 256   // Accept widening loads based on alignment.
 257   if (IsLoad && MemSize < Size)
 258     MemSize = std::max(MemSize, Align);
 259 #endif
 260
 261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
 262   if (MemSize != RegSize && RegSize != 32)
 263     return false;
 264
 265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
 266     return false;
 267
 268   switch (MemSize) {
 269   case 8:
 270   case 16:
 271   case 32:
 272   case 64:
 273   case 128:
 274     break;
 275   case 96:
 276     if (!ST.hasDwordx3LoadStores())
 277       return false;
 278     break;
 279   case 256:
 280   case 512:
 281     // These may contextually need to be broken down.
 282     break;
 283   default:
 284     return false;
 285   }
 286
 287   assert(RegSize >= MemSize);
 288
 289   if (Align < MemSize) {
 290     const SITargetLowering *TLI = ST.getTargetLowering();
 291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
 292       return false;
 293   }
 294
 295   return true;
 296 }
 297
 298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
 299 // workaround this. Eventually it should ignore the type for loads and only care
 300 // about the size. Return true in cases where we will workaround this for now by
 301 // bitcasting.
 302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
 303   if (EnableNewLegality)
 304     return false;
 305
 306   const unsigned Size = Ty.getSizeInBits();
 307   if (Size <= 64)
 308     return false;
 309   if (!Ty.isVector())
 310     return true;
 311   unsigned EltSize = Ty.getElementType().getSizeInBits();
 312   return EltSize != 32 && EltSize != 64;
 313 }
 314
 315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
 316                              unsigned Opcode) {
 317   const LLT Ty = Query.Types[0];
 318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
 319          !loadStoreBitcastWorkaround(Ty);
 320 }
 321
 322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 323                                          const GCNTargetMachine &TM)
 324   :  ST(ST_) {
 325   using namespace TargetOpcode;
 326
 327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
 328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
 329   };
 330
 331   const LLT S1 = LLT::scalar(1);
 332   const LLT S16 = LLT::scalar(16);
 333   const LLT S32 = LLT::scalar(32);
 334   const LLT S64 = LLT::scalar(64);
 335   const LLT S128 = LLT::scalar(128);
 336   const LLT S256 = LLT::scalar(256);
 337   const LLT S512 = LLT::scalar(512);
 338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
 339
 340   const LLT V2S16 = LLT::vector(2, 16);
 341   const LLT V4S16 = LLT::vector(4, 16);
 342
 343   const LLT V2S32 = LLT::vector(2, 32);
 344   const LLT V3S32 = LLT::vector(3, 32);
 345   const LLT V4S32 = LLT::vector(4, 32);
 346   const LLT V5S32 = LLT::vector(5, 32);
 347   const LLT V6S32 = LLT::vector(6, 32);
 348   const LLT V7S32 = LLT::vector(7, 32);
 349   const LLT V8S32 = LLT::vector(8, 32);
 350   const LLT V9S32 = LLT::vector(9, 32);
 351   const LLT V10S32 = LLT::vector(10, 32);
 352   const LLT V11S32 = LLT::vector(11, 32);
 353   const LLT V12S32 = LLT::vector(12, 32);
 354   const LLT V13S32 = LLT::vector(13, 32);
 355   const LLT V14S32 = LLT::vector(14, 32);
 356   const LLT V15S32 = LLT::vector(15, 32);
 357   const LLT V16S32 = LLT::vector(16, 32);
 358   const LLT V32S32 = LLT::vector(32, 32);
 359
 360   const LLT V2S64 = LLT::vector(2, 64);
 361   const LLT V3S64 = LLT::vector(3, 64);
 362   const LLT V4S64 = LLT::vector(4, 64);
 363   const LLT V5S64 = LLT::vector(5, 64);
 364   const LLT V6S64 = LLT::vector(6, 64);
 365   const LLT V7S64 = LLT::vector(7, 64);
 366   const LLT V8S64 = LLT::vector(8, 64);
 367   const LLT V16S64 = LLT::vector(16, 64);
 368
 369   std::initializer_list<LLT> AllS32Vectors =
 370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
 371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
 372   std::initializer_list<LLT> AllS64Vectors =
 373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
 374
 375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
 376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
 377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
 378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
 379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
 380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
 381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
 382
 383   const LLT CodePtr = FlatPtr;
 384
 385   const std::initializer_list<LLT> AddrSpaces64 = {
 386     GlobalPtr, ConstantPtr, FlatPtr
 387   };
 388
 389   const std::initializer_list<LLT> AddrSpaces32 = {
 390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
 391   };
 392
 393   const std::initializer_list<LLT> FPTypesBase = {
 394     S32, S64
 395   };
 396
 397   const std::initializer_list<LLT> FPTypes16 = {
 398     S32, S64, S16
 399   };
 400
 401   const std::initializer_list<LLT> FPTypesPK16 = {
 402     S32, S64, S16, V2S16
 403   };
 404
 405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
 406
 407   setAction({G_BRCOND, S1}, Legal); // VCC branches
 408   setAction({G_BRCOND, S32}, Legal); // SCC branches
 409
 410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
 411   // elements for v3s16
 412   getActionDefinitionsBuilder(G_PHI)
 413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
 414     .legalFor(AllS32Vectors)
 415     .legalFor(AllS64Vectors)
 416     .legalFor(AddrSpaces64)
 417     .legalFor(AddrSpaces32)
 418     .clampScalar(0, S32, S256)
 419     .widenScalarToNextPow2(0, 32)
 420     .clampMaxNumElements(0, S32, 16)
 421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 422     .legalIf(isPointer(0));
 423
 424   if (ST.hasVOP3PInsts()) {
 425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 426       .legalFor({S32, S16, V2S16})
 427       .clampScalar(0, S16, S32)
 428       .clampMaxNumElements(0, S16, 2)
 429       .scalarize(0)
 430       .widenScalarToNextPow2(0, 32);
 431   } else if (ST.has16BitInsts()) {
 432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 433       .legalFor({S32, S16})
 434       .clampScalar(0, S16, S32)
 435       .scalarize(0)
 436       .widenScalarToNextPow2(0, 32);
 437   } else {
 438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
 439       .legalFor({S32})
 440       .clampScalar(0, S32, S32)
 441       .scalarize(0);
 442   }
 443
 444   // FIXME: Not really legal. Placeholder for custom lowering.
 445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
 446     .customFor({S32, S64})
 447     .clampScalar(0, S32, S64)
 448     .widenScalarToNextPow2(0, 32)
 449     .scalarize(0);
 450
 451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
 452     .legalFor({S32})
 453     .clampScalar(0, S32, S32)
 454     .scalarize(0);
 455
 456   // Report legal for any types we can handle anywhere. For the cases only legal
 457   // on the SALU, RegBankSelect will be able to re-legalize.
 458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
 459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 460     .clampScalar(0, S32, S64)
 461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
 463     .widenScalarToNextPow2(0)
 464     .scalarize(0);
 465
 466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
 467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
 468     .legalFor({{S32, S1}, {S32, S32}})
 469     .minScalar(0, S32)
 470     // TODO: .scalarize(0)
 471     .lower();
 472
 473   getActionDefinitionsBuilder(G_BITCAST)
 474     // Don't worry about the size constraint.
 475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
 476     .lower();
 477
 478
 479   getActionDefinitionsBuilder(G_CONSTANT)
 480     .legalFor({S1, S32, S64, S16, GlobalPtr,
 481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
 482     .clampScalar(0, S32, S64)
 483     .widenScalarToNextPow2(0)
 484     .legalIf(isPointer(0));
 485
 486   getActionDefinitionsBuilder(G_FCONSTANT)
 487     .legalFor({S32, S64, S16})
 488     .clampScalar(0, S16, S64);
 489
 490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
 491       .legalIf(isRegisterType(0))
 492       // s1 and s16 are special cases because they have legal operations on
 493       // them, but don't really occupy registers in the normal way.
 494       .legalFor({S1, S16})
 495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 496       .clampScalarOrElt(0, S32, MaxScalar)
 497       .widenScalarToNextPow2(0, 32)
 498       .clampMaxNumElements(0, S32, 16);
 499
 500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
 501
 502   // If the amount is divergent, we have to do a wave reduction to get the
 503   // maximum value, so this is expanded during RegBankSelect.
 504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
 505     .legalFor({{PrivatePtr, S32}});
 506
 507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
 508     .unsupportedFor({PrivatePtr})
 509     .custom();
 510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
 511
 512   auto &FPOpActions = getActionDefinitionsBuilder(
 513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
 514     .legalFor({S32, S64});
 515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
 516     .customFor({S32, S64});
 517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
 518     .customFor({S32, S64});
 519
 520   if (ST.has16BitInsts()) {
 521     if (ST.hasVOP3PInsts())
 522       FPOpActions.legalFor({S16, V2S16});
 523     else
 524       FPOpActions.legalFor({S16});
 525
 526     TrigActions.customFor({S16});
 527     FDIVActions.customFor({S16});
 528   }
 529
 530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
 531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
 532
 533   if (ST.hasVOP3PInsts()) {
 534     MinNumMaxNum.customFor(FPTypesPK16)
 535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 536       .clampMaxNumElements(0, S16, 2)
 537       .clampScalar(0, S16, S64)
 538       .scalarize(0);
 539   } else if (ST.has16BitInsts()) {
 540     MinNumMaxNum.customFor(FPTypes16)
 541       .clampScalar(0, S16, S64)
 542       .scalarize(0);
 543   } else {
 544     MinNumMaxNum.customFor(FPTypesBase)
 545       .clampScalar(0, S32, S64)
 546       .scalarize(0);
 547   }
 548
 549   if (ST.hasVOP3PInsts())
 550     FPOpActions.clampMaxNumElements(0, S16, 2);
 551
 552   FPOpActions
 553     .scalarize(0)
 554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 555
 556   TrigActions
 557     .scalarize(0)
 558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 559
 560   FDIVActions
 561     .scalarize(0)
 562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
 563
 564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
 565     .legalFor(FPTypesPK16)
 566     .clampMaxNumElements(0, S16, 2)
 567     .scalarize(0)
 568     .clampScalar(0, S16, S64);
 569
 570   if (ST.has16BitInsts()) {
 571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
 572       .legalFor({S32, S64, S16})
 573       .scalarize(0)
 574       .clampScalar(0, S16, S64);
 575   } else {
 576     getActionDefinitionsBuilder(G_FSQRT)
 577       .legalFor({S32, S64})
 578       .scalarize(0)
 579       .clampScalar(0, S32, S64);
 580
 581     if (ST.hasFractBug()) {
 582       getActionDefinitionsBuilder(G_FFLOOR)
 583         .customFor({S64})
 584         .legalFor({S32, S64})
 585         .scalarize(0)
 586         .clampScalar(0, S32, S64);
 587     } else {
 588       getActionDefinitionsBuilder(G_FFLOOR)
 589         .legalFor({S32, S64})
 590         .scalarize(0)
 591         .clampScalar(0, S32, S64);
 592     }
 593   }
 594
 595   getActionDefinitionsBuilder(G_FPTRUNC)
 596     .legalFor({{S32, S64}, {S16, S32}})
 597     .scalarize(0)
 598     .lower();
 599
 600   getActionDefinitionsBuilder(G_FPEXT)
 601     .legalFor({{S64, S32}, {S32, S16}})
 602     .lowerFor({{S64, S16}}) // FIXME: Implement
 603     .scalarize(0);
 604
 605   getActionDefinitionsBuilder(G_FSUB)
 606       // Use actual fsub instruction
 607       .legalFor({S32})
 608       // Must use fadd + fneg
 609       .lowerFor({S64, S16, V2S16})
 610       .scalarize(0)
 611       .clampScalar(0, S32, S64);
 612
 613   // Whether this is legal depends on the floating point mode for the function.
 614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
 615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
 616     FMad.customFor({S32, S16});
 617   else if (ST.hasMadMacF32Insts())
 618     FMad.customFor({S32});
 619   else if (ST.hasMadF16())
 620     FMad.customFor({S16});
 621   FMad.scalarize(0)
 622       .lower();
 623
 624   // TODO: Do we need to clamp maximum bitwidth?
 625   getActionDefinitionsBuilder(G_TRUNC)
 626     .legalIf(isScalar(0))
 627     .legalFor({{V2S16, V2S32}})
 628     .clampMaxNumElements(0, S16, 2)
 629     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
 630     // situations (like an invalid implicit use), we don't want to infinite loop
 631     // in the legalizer.
 632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
 633     .alwaysLegal();
 634
 635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
 636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
 637                {S32, S1}, {S64, S1}, {S16, S1}})
 638     .scalarize(0)
 639     .clampScalar(0, S32, S64)
 640     .widenScalarToNextPow2(1, 32);
 641
 642   // TODO: Split s1->s64 during regbankselect for VALU.
 643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
 644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
 645     .lowerFor({{S32, S64}})
 646     .lowerIf(typeIs(1, S1))
 647     .customFor({{S64, S64}});
 648   if (ST.has16BitInsts())
 649     IToFP.legalFor({{S16, S16}});
 650   IToFP.clampScalar(1, S32, S64)
 651        .scalarize(0)
 652        .widenScalarToNextPow2(1);
 653
 654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
 655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
 656     .customFor({{S64, S64}});
 657   if (ST.has16BitInsts())
 658     FPToI.legalFor({{S16, S16}});
 659   else
 660     FPToI.minScalar(1, S32);
 661
 662   FPToI.minScalar(0, S32)
 663        .scalarize(0)
 664        .lower();
 665
 666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
 667     .scalarize(0)
 668     .lower();
 669
 670   if (ST.has16BitInsts()) {
 671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 672       .legalFor({S16, S32, S64})
 673       .clampScalar(0, S16, S64)
 674       .scalarize(0);
 675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
 676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 677       .legalFor({S32, S64})
 678       .clampScalar(0, S32, S64)
 679       .scalarize(0);
 680   } else {
 681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
 682       .legalFor({S32})
 683       .customFor({S64})
 684       .clampScalar(0, S32, S64)
 685       .scalarize(0);
 686   }
 687
 688   // FIXME: Clamp offset operand.
 689   getActionDefinitionsBuilder(G_PTR_ADD)
 690     .legalIf(isPointer(0))
 691     .scalarize(0);
 692
 693   getActionDefinitionsBuilder(G_PTRMASK)
 694     .legalIf(typeInSet(1, {S64, S32}))
 695     .minScalar(1, S32)
 696     .maxScalarIf(sizeIs(0, 32), 1, S32)
 697     .maxScalarIf(sizeIs(0, 64), 1, S64)
 698     .scalarize(0);
 699
 700   auto &CmpBuilder =
 701     getActionDefinitionsBuilder(G_ICMP)
 702     // The compare output type differs based on the register bank of the output,
 703     // so make both s1 and s32 legal.
 704     //
 705     // Scalar compares producing output in scc will be promoted to s32, as that
 706     // is the allocatable register type that will be needed for the copy from
 707     // scc. This will be promoted during RegBankSelect, and we assume something
 708     // before that won't try to use s32 result types.
 709     //
 710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
 711     // bank.
 712     .legalForCartesianProduct(
 713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
 714     .legalForCartesianProduct(
 715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
 716   if (ST.has16BitInsts()) {
 717     CmpBuilder.legalFor({{S1, S16}});
 718   }
 719
 720   CmpBuilder
 721     .widenScalarToNextPow2(1)
 722     .clampScalar(1, S32, S64)
 723     .scalarize(0)
 724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
 725
 726   getActionDefinitionsBuilder(G_FCMP)
 727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
 728     .widenScalarToNextPow2(1)
 729     .clampScalar(1, S32, S64)
 730     .scalarize(0);
 731
 732   // FIXME: fpow has a selection pattern that should move to custom lowering.
 733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
 734   if (ST.has16BitInsts())
 735     Exp2Ops.legalFor({S32, S16});
 736   else
 737     Exp2Ops.legalFor({S32});
 738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
 739   Exp2Ops.scalarize(0);
 740
 741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
 742   if (ST.has16BitInsts())
 743     ExpOps.customFor({{S32}, {S16}});
 744   else
 745     ExpOps.customFor({S32});
 746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
 747         .scalarize(0);
 748
 749   // The 64-bit versions produce 32-bit results, but only on the SALU.
 750   getActionDefinitionsBuilder(G_CTPOP)
 751     .legalFor({{S32, S32}, {S32, S64}})
 752     .clampScalar(0, S32, S32)
 753     .clampScalar(1, S32, S64)
 754     .scalarize(0)
 755     .widenScalarToNextPow2(0, 32)
 756     .widenScalarToNextPow2(1, 32);
 757
 758   // The hardware instructions return a different result on 0 than the generic
 759   // instructions expect. The hardware produces -1, but these produce the
 760   // bitwidth.
 761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
 762     .scalarize(0)
 763     .clampScalar(0, S32, S32)
 764     .clampScalar(1, S32, S64)
 765     .widenScalarToNextPow2(0, 32)
 766     .widenScalarToNextPow2(1, 32)
 767     .lower();
 768
 769   // The 64-bit versions produce 32-bit results, but only on the SALU.
 770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
 771     .legalFor({{S32, S32}, {S32, S64}})
 772     .clampScalar(0, S32, S32)
 773     .clampScalar(1, S32, S64)
 774     .scalarize(0)
 775     .widenScalarToNextPow2(0, 32)
 776     .widenScalarToNextPow2(1, 32);
 777
 778   getActionDefinitionsBuilder(G_BITREVERSE)
 779     .legalFor({S32})
 780     .clampScalar(0, S32, S32)
 781     .scalarize(0);
 782
 783   if (ST.has16BitInsts()) {
 784     getActionDefinitionsBuilder(G_BSWAP)
 785       .legalFor({S16, S32, V2S16})
 786       .clampMaxNumElements(0, S16, 2)
 787       // FIXME: Fixing non-power-of-2 before clamp is workaround for
 788       // narrowScalar limitation.
 789       .widenScalarToNextPow2(0)
 790       .clampScalar(0, S16, S32)
 791       .scalarize(0);
 792
 793     if (ST.hasVOP3PInsts()) {
 794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 795         .legalFor({S32, S16, V2S16})
 796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 797         .clampMaxNumElements(0, S16, 2)
 798         .minScalar(0, S16)
 799         .widenScalarToNextPow2(0)
 800         .scalarize(0)
 801         .lower();
 802     } else {
 803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 804         .legalFor({S32, S16})
 805         .widenScalarToNextPow2(0)
 806         .minScalar(0, S16)
 807         .scalarize(0)
 808         .lower();
 809     }
 810   } else {
 811     // TODO: Should have same legality without v_perm_b32
 812     getActionDefinitionsBuilder(G_BSWAP)
 813       .legalFor({S32})
 814       .lowerIf(scalarNarrowerThan(0, 32))
 815       // FIXME: Fixing non-power-of-2 before clamp is workaround for
 816       // narrowScalar limitation.
 817       .widenScalarToNextPow2(0)
 818       .maxScalar(0, S32)
 819       .scalarize(0)
 820       .lower();
 821
 822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
 823       .legalFor({S32})
 824       .minScalar(0, S32)
 825       .widenScalarToNextPow2(0)
 826       .scalarize(0)
 827       .lower();
 828   }
 829
 830   getActionDefinitionsBuilder(G_INTTOPTR)
 831     // List the common cases
 832     .legalForCartesianProduct(AddrSpaces64, {S64})
 833     .legalForCartesianProduct(AddrSpaces32, {S32})
 834     .scalarize(0)
 835     // Accept any address space as long as the size matches
 836     .legalIf(sameSize(0, 1))
 837     .widenScalarIf(smallerThan(1, 0),
 838       [](const LegalityQuery &Query) {
 839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 840       })
 841     .narrowScalarIf(largerThan(1, 0),
 842       [](const LegalityQuery &Query) {
 843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
 844       });
 845
 846   getActionDefinitionsBuilder(G_PTRTOINT)
 847     // List the common cases
 848     .legalForCartesianProduct(AddrSpaces64, {S64})
 849     .legalForCartesianProduct(AddrSpaces32, {S32})
 850     .scalarize(0)
 851     // Accept any address space as long as the size matches
 852     .legalIf(sameSize(0, 1))
 853     .widenScalarIf(smallerThan(0, 1),
 854       [](const LegalityQuery &Query) {
 855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 856       })
 857     .narrowScalarIf(
 858       largerThan(0, 1),
 859       [](const LegalityQuery &Query) {
 860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
 861       });
 862
 863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
 864     .scalarize(0)
 865     .custom();
 866
 867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
 868                                     bool IsLoad) -> bool {
 869     const LLT DstTy = Query.Types[0];
 870
 871     // Split vector extloads.
 872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
 873     unsigned Align = Query.MMODescrs[0].AlignInBits;
 874
 875     if (MemSize < DstTy.getSizeInBits())
 876       MemSize = std::max(MemSize, Align);
 877
 878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
 879       return true;
 880
 881     const LLT PtrTy = Query.Types[1];
 882     unsigned AS = PtrTy.getAddressSpace();
 883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
 884       return true;
 885
 886     // Catch weird sized loads that don't evenly divide into the access sizes
 887     // TODO: May be able to widen depending on alignment etc.
 888     unsigned NumRegs = (MemSize + 31) / 32;
 889     if (NumRegs == 3) {
 890       if (!ST.hasDwordx3LoadStores())
 891         return true;
 892     } else {
 893       // If the alignment allows, these should have been widened.
 894       if (!isPowerOf2_32(NumRegs))
 895         return true;
 896     }
 897
 898     if (Align < MemSize) {
 899       const SITargetLowering *TLI = ST.getTargetLowering();
 900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
 901     }
 902
 903     return false;
 904   };
 905
 906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
 907                                          unsigned Opc) -> bool {
 908     unsigned Size = Query.Types[0].getSizeInBits();
 909     if (isPowerOf2_32(Size))
 910       return false;
 911
 912     if (Size == 96 && ST.hasDwordx3LoadStores())
 913       return false;
 914
 915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
 916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
 917       return false;
 918
 919     unsigned Align = Query.MMODescrs[0].AlignInBits;
 920     unsigned RoundedSize = NextPowerOf2(Size);
 921     return (Align >= RoundedSize);
 922   };
 923
 924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
 925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
 926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
 927
 928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
 929   // LDS
 930   // TODO: Unsupported flat for SI.
 931
 932   for (unsigned Op : {G_LOAD, G_STORE}) {
 933     const bool IsStore = Op == G_STORE;
 934
 935     auto &Actions = getActionDefinitionsBuilder(Op);
 936     // Explicitly list some common cases.
 937     // TODO: Does this help compile time at all?
 938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
 939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
 940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
 941                                       {S64, GlobalPtr, 64, GlobalAlign32},
 942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
 943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
 944                                       {S32, GlobalPtr, 8, GlobalAlign8},
 945                                       {S32, GlobalPtr, 16, GlobalAlign16},
 946
 947                                       {S32, LocalPtr, 32, 32},
 948                                       {S64, LocalPtr, 64, 32},
 949                                       {V2S32, LocalPtr, 64, 32},
 950                                       {S32, LocalPtr, 8, 8},
 951                                       {S32, LocalPtr, 16, 16},
 952                                       {V2S16, LocalPtr, 32, 32},
 953
 954                                       {S32, PrivatePtr, 32, 32},
 955                                       {S32, PrivatePtr, 8, 8},
 956                                       {S32, PrivatePtr, 16, 16},
 957                                       {V2S16, PrivatePtr, 32, 32},
 958
 959                                       {S32, ConstantPtr, 32, GlobalAlign32},
 960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
 961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
 962                                       {S64, ConstantPtr, 64, GlobalAlign32},
 963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
 964     Actions.legalIf(
 965       [=](const LegalityQuery &Query) -> bool {
 966         return isLoadStoreLegal(ST, Query, Op);
 967       });
 968
 969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
 970     // 64-bits.
 971     //
 972     // TODO: Should generalize bitcast action into coerce, which will also cover
 973     // inserting addrspacecasts.
 974     Actions.customIf(typeIs(1, Constant32Ptr));
 975
 976     // Turn any illegal element vectors into something easier to deal
 977     // with. These will ultimately produce 32-bit scalar shifts to extract the
 978     // parts anyway.
 979     //
 980     // For odd 16-bit element vectors, prefer to split those into pieces with
 981     // 16-bit vector parts.
 982     Actions.bitcastIf(
 983       [=](const LegalityQuery &Query) -> bool {
 984         const LLT Ty = Query.Types[0];
 985         const unsigned Size = Ty.getSizeInBits();
 986
 987         if (Size != Query.MMODescrs[0].SizeInBits)
 988           return Size <= 32 && Ty.isVector();
 989
 990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
 991           return true;
 992         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
 993                !isRegisterVectorElementType(Ty.getElementType());
 994       }, bitcastToRegisterType(0));
 995
 996     Actions
 997         .customIf(typeIs(1, Constant32Ptr))
 998         // Widen suitably aligned loads by loading extra elements.
 999         .moreElementsIf([=](const LegalityQuery &Query) {
1000             const LLT Ty = Query.Types[0];
1001             return Op == G_LOAD && Ty.isVector() &&
1002                    shouldWidenLoadResult(Query, Op);
1003           }, moreElementsToNextPow2(0))
1004         .widenScalarIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && !Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, widenScalarOrEltToNextPow2(0))
1009         .narrowScalarIf(
1010             [=](const LegalityQuery &Query) -> bool {
1011               return !Query.Types[0].isVector() &&
1012                      needToSplitMemOp(Query, Op == G_LOAD);
1013             },
1014             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015               const LLT DstTy = Query.Types[0];
1016               const LLT PtrTy = Query.Types[1];
1017
1018               const unsigned DstSize = DstTy.getSizeInBits();
1019               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020
1021               // Split extloads.
1022               if (DstSize > MemSize)
1023                 return std::make_pair(0, LLT::scalar(MemSize));
1024
1025               if (!isPowerOf2_32(DstSize)) {
1026                 // We're probably decomposing an odd sized store. Try to split
1027                 // to the widest type. TODO: Account for alignment. As-is it
1028                 // should be OK, since the new parts will be further legalized.
1029                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030                 return std::make_pair(0, LLT::scalar(FloorSize));
1031               }
1032
1033               if (DstSize > 32 && (DstSize % 32 != 0)) {
1034                 // FIXME: Need a way to specify non-extload of larger size if
1035                 // suitably aligned.
1036                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037               }
1038
1039               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040                                                      PtrTy.getAddressSpace(),
1041                                                      Op == G_LOAD);
1042               if (MemSize > MaxSize)
1043                 return std::make_pair(0, LLT::scalar(MaxSize));
1044
1045               unsigned Align = Query.MMODescrs[0].AlignInBits;
1046               return std::make_pair(0, LLT::scalar(Align));
1047             })
1048         .fewerElementsIf(
1049             [=](const LegalityQuery &Query) -> bool {
1050               return Query.Types[0].isVector() &&
1051                      needToSplitMemOp(Query, Op == G_LOAD);
1052             },
1053             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054               const LLT DstTy = Query.Types[0];
1055               const LLT PtrTy = Query.Types[1];
1056
1057               LLT EltTy = DstTy.getElementType();
1058               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059                                                      PtrTy.getAddressSpace(),
1060                                                      Op == G_LOAD);
1061
1062               // FIXME: Handle widened to power of 2 results better. This ends
1063               // up scalarizing.
1064               // FIXME: 3 element stores scalarized on SI
1065
1066               // Split if it's too large for the address space.
1067               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068                 unsigned NumElts = DstTy.getNumElements();
1069                 unsigned EltSize = EltTy.getSizeInBits();
1070
1071                 if (MaxSize % EltSize == 0) {
1072                   return std::make_pair(
1073                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074                 }
1075
1076                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077
1078                 // FIXME: Refine when odd breakdowns handled
1079                 // The scalars will need to be re-legalized.
1080                 if (NumPieces == 1 || NumPieces >= NumElts ||
1081                     NumElts % NumPieces != 0)
1082                   return std::make_pair(0, EltTy);
1083
1084                 return std::make_pair(0,
1085                                       LLT::vector(NumElts / NumPieces, EltTy));
1086               }
1087
1088               // FIXME: We could probably handle weird extending loads better.
1089               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090               if (DstTy.getSizeInBits() > MemSize)
1091                 return std::make_pair(0, EltTy);
1092
1093               unsigned EltSize = EltTy.getSizeInBits();
1094               unsigned DstSize = DstTy.getSizeInBits();
1095               if (!isPowerOf2_32(DstSize)) {
1096                 // We're probably decomposing an odd sized store. Try to split
1097                 // to the widest type. TODO: Account for alignment. As-is it
1098                 // should be OK, since the new parts will be further legalized.
1099                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100                 return std::make_pair(
1101                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102               }
1103
1104               // Need to split because of alignment.
1105               unsigned Align = Query.MMODescrs[0].AlignInBits;
1106               if (EltSize > Align &&
1107                   (EltSize / Align < DstTy.getNumElements())) {
1108                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109               }
1110
1111               // May need relegalization for the scalars.
1112               return std::make_pair(0, EltTy);
1113             })
1114         .minScalar(0, S32);
1115
1116     if (IsStore)
1117       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118
1119     // TODO: Need a bitcast lower option?
1120     Actions
1121         .widenScalarToNextPow2(0)
1122         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123   }
1124
1125   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127                                                   {S32, GlobalPtr, 16, 2 * 8},
1128                                                   {S32, LocalPtr, 8, 8},
1129                                                   {S32, LocalPtr, 16, 16},
1130                                                   {S32, PrivatePtr, 8, 8},
1131                                                   {S32, PrivatePtr, 16, 16},
1132                                                   {S32, ConstantPtr, 8, 8},
1133                                                   {S32, ConstantPtr, 16, 2 * 8}});
1134   if (ST.hasFlatAddressSpace()) {
1135     ExtLoads.legalForTypesWithMemDesc(
1136         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137   }
1138
1139   ExtLoads.clampScalar(0, S32, S32)
1140           .widenScalarToNextPow2(0)
1141           .unsupportedIfMemSizeNotPow2()
1142           .lower();
1143
1144   auto &Atomics = getActionDefinitionsBuilder(
1145     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148      G_ATOMICRMW_UMIN})
1149     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150                {S64, GlobalPtr}, {S64, LocalPtr}});
1151   if (ST.hasFlatAddressSpace()) {
1152     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153   }
1154
1155   if (ST.hasLDSFPAtomics()) {
1156     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157       .legalFor({{S32, LocalPtr}});
1158   }
1159
1160   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161   // demarshalling
1162   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164                 {S32, FlatPtr}, {S64, FlatPtr}})
1165     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166                {S32, RegionPtr}, {S64, RegionPtr}});
1167   // TODO: Pointer types, any 32-bit or 64-bit vector
1168
1169   // Condition should be s32 for scalar, s1 for vector.
1170   getActionDefinitionsBuilder(G_SELECT)
1171     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174     .clampScalar(0, S16, S64)
1175     .scalarize(1)
1176     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178     .clampMaxNumElements(0, S32, 2)
1179     .clampMaxNumElements(0, LocalPtr, 2)
1180     .clampMaxNumElements(0, PrivatePtr, 2)
1181     .scalarize(0)
1182     .widenScalarToNextPow2(0)
1183     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184
1185   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186   // be more flexible with the shift amount type.
1187   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188     .legalFor({{S32, S32}, {S64, S32}});
1189   if (ST.has16BitInsts()) {
1190     if (ST.hasVOP3PInsts()) {
1191       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192             .clampMaxNumElements(0, S16, 2);
1193     } else
1194       Shifts.legalFor({{S16, S16}});
1195
1196     // TODO: Support 16-bit shift amounts for all types
1197     Shifts.widenScalarIf(
1198       [=](const LegalityQuery &Query) {
1199         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200         // 32-bit amount.
1201         const LLT ValTy = Query.Types[0];
1202         const LLT AmountTy = Query.Types[1];
1203         return ValTy.getSizeInBits() <= 16 &&
1204                AmountTy.getSizeInBits() < 16;
1205       }, changeTo(1, S16));
1206     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207     Shifts.clampScalar(1, S32, S32);
1208     Shifts.clampScalar(0, S16, S64);
1209     Shifts.widenScalarToNextPow2(0, 16);
1210   } else {
1211     // Make sure we legalize the shift amount type first, as the general
1212     // expansion for the shifted type will produce much worse code if it hasn't
1213     // been truncated already.
1214     Shifts.clampScalar(1, S32, S32);
1215     Shifts.clampScalar(0, S32, S64);
1216     Shifts.widenScalarToNextPow2(0, 32);
1217   }
1218   Shifts.scalarize(0);
1219
1220   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223     unsigned IdxTypeIdx = 2;
1224
1225     getActionDefinitionsBuilder(Op)
1226       .customIf([=](const LegalityQuery &Query) {
1227           const LLT EltTy = Query.Types[EltTypeIdx];
1228           const LLT VecTy = Query.Types[VecTypeIdx];
1229           const LLT IdxTy = Query.Types[IdxTypeIdx];
1230           return (EltTy.getSizeInBits() == 16 ||
1231                   EltTy.getSizeInBits() % 32 == 0) &&
1232                  VecTy.getSizeInBits() % 32 == 0 &&
1233                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1234                  IdxTy.getSizeInBits() == 32;
1235         })
1236       .clampScalar(EltTypeIdx, S32, S64)
1237       .clampScalar(VecTypeIdx, S32, S64)
1238       .clampScalar(IdxTypeIdx, S32, S32);
1239   }
1240
1241   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242     .unsupportedIf([=](const LegalityQuery &Query) {
1243         const LLT &EltTy = Query.Types[1].getElementType();
1244         return Query.Types[0] != EltTy;
1245       });
1246
1247   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250
1251     // FIXME: Doesn't handle extract of illegal sizes.
1252     getActionDefinitionsBuilder(Op)
1253       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254       // FIXME: Multiples of 16 should not be legal.
1255       .legalIf([=](const LegalityQuery &Query) {
1256           const LLT BigTy = Query.Types[BigTyIdx];
1257           const LLT LitTy = Query.Types[LitTyIdx];
1258           return (BigTy.getSizeInBits() % 32 == 0) &&
1259                  (LitTy.getSizeInBits() % 16 == 0);
1260         })
1261       .widenScalarIf(
1262         [=](const LegalityQuery &Query) {
1263           const LLT BigTy = Query.Types[BigTyIdx];
1264           return (BigTy.getScalarSizeInBits() < 16);
1265         },
1266         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267       .widenScalarIf(
1268         [=](const LegalityQuery &Query) {
1269           const LLT LitTy = Query.Types[LitTyIdx];
1270           return (LitTy.getScalarSizeInBits() < 16);
1271         },
1272         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274       .widenScalarToNextPow2(BigTyIdx, 32);
1275
1276   }
1277
1278   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279     .legalForCartesianProduct(AllS32Vectors, {S32})
1280     .legalForCartesianProduct(AllS64Vectors, {S64})
1281     .clampNumElements(0, V16S32, V32S32)
1282     .clampNumElements(0, V2S64, V16S64)
1283     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284
1285   if (ST.hasScalarPackInsts()) {
1286     BuildVector
1287       // FIXME: Should probably widen s1 vectors straight to s32
1288       .minScalarOrElt(0, S16)
1289       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290       .minScalar(1, S32);
1291
1292     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293       .legalFor({V2S16, S32})
1294       .lower();
1295     BuildVector.minScalarOrElt(0, S32);
1296   } else {
1297     BuildVector.customFor({V2S16, S16});
1298     BuildVector.minScalarOrElt(0, S32);
1299
1300     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301       .customFor({V2S16, S32})
1302       .lower();
1303   }
1304
1305   BuildVector.legalIf(isRegisterType(0));
1306
1307   // FIXME: Clamp maximum size
1308   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309     .legalIf(isRegisterType(0));
1310
1311   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1312   // pre-legalize.
1313   if (ST.hasVOP3PInsts()) {
1314     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315       .customFor({V2S16, V2S16})
1316       .lower();
1317   } else
1318     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319
1320   // Merge/Unmerge
1321   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324
1325     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326       const LLT Ty = Query.Types[TypeIdx];
1327       if (Ty.isVector()) {
1328         const LLT &EltTy = Ty.getElementType();
1329         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330           return true;
1331         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332           return true;
1333       }
1334       return false;
1335     };
1336
1337     auto &Builder = getActionDefinitionsBuilder(Op)
1338       .lowerFor({{S16, V2S16}})
1339       .lowerIf([=](const LegalityQuery &Query) {
1340           const LLT BigTy = Query.Types[BigTyIdx];
1341           return BigTy.getSizeInBits() == 32;
1342         })
1343       // Try to widen to s16 first for small types.
1344       // TODO: Only do this on targets with legal s16 shifts
1345       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349                            elementTypeIs(1, S16)),
1350                        changeTo(1, V2S16))
1351       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1352       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353       // valid.
1354       .clampScalar(LitTyIdx, S32, S512)
1355       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356       // Break up vectors with weird elements into scalars
1357       .fewerElementsIf(
1358         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359         scalarize(0))
1360       .fewerElementsIf(
1361         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362         scalarize(1))
1363       .clampScalar(BigTyIdx, S32, MaxScalar);
1364
1365     if (Op == G_MERGE_VALUES) {
1366       Builder.widenScalarIf(
1367         // TODO: Use 16-bit shifts if legal for 8-bit values?
1368         [=](const LegalityQuery &Query) {
1369           const LLT Ty = Query.Types[LitTyIdx];
1370           return Ty.getSizeInBits() < 32;
1371         },
1372         changeTo(LitTyIdx, S32));
1373     }
1374
1375     Builder.widenScalarIf(
1376       [=](const LegalityQuery &Query) {
1377         const LLT Ty = Query.Types[BigTyIdx];
1378         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379           Ty.getSizeInBits() % 16 != 0;
1380       },
1381       [=](const LegalityQuery &Query) {
1382         // Pick the next power of 2, or a multiple of 64 over 128.
1383         // Whichever is smaller.
1384         const LLT &Ty = Query.Types[BigTyIdx];
1385         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386         if (NewSizeInBits >= 256) {
1387           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388           if (RoundedTo < NewSizeInBits)
1389             NewSizeInBits = RoundedTo;
1390         }
1391         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392       })
1393       .legalIf([=](const LegalityQuery &Query) {
1394           const LLT &BigTy = Query.Types[BigTyIdx];
1395           const LLT &LitTy = Query.Types[LitTyIdx];
1396
1397           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398             return false;
1399           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400             return false;
1401
1402           return BigTy.getSizeInBits() % 16 == 0 &&
1403                  LitTy.getSizeInBits() % 16 == 0 &&
1404                  BigTy.getSizeInBits() <= MaxRegisterSize;
1405         })
1406       // Any vectors left are the wrong size. Scalarize them.
1407       .scalarize(0)
1408       .scalarize(1);
1409   }
1410
1411   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412   // RegBankSelect.
1413   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414     .legalFor({{S32}, {S64}});
1415
1416   if (ST.hasVOP3PInsts()) {
1417     SextInReg.lowerFor({{V2S16}})
1418       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419       // get more vector shift opportunities, since we'll get those when
1420       // expanded.
1421       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422   } else if (ST.has16BitInsts()) {
1423     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424   } else {
1425     // Prefer to promote to s32 before lowering if we don't have 16-bit
1426     // shifts. This avoid a lot of intermediate truncate and extend operations.
1427     SextInReg.lowerFor({{S32}, {S64}});
1428   }
1429
1430   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1431   // available, and is selectively legal for s16, s32, v2s16.
1432   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1433     .scalarize(0)
1434     .clampScalar(0, S16, S32);
1435
1436   SextInReg
1437     .scalarize(0)
1438     .clampScalar(0, S32, S64)
1439     .lower();
1440
1441   getActionDefinitionsBuilder(G_FSHR)
1442     .legalFor({{S32, S32}})
1443     .scalarize(0)
1444     .lower();
1445
1446   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1447     .legalFor({S64});
1448
1449   getActionDefinitionsBuilder({
1450       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1451       G_FCOPYSIGN,
1452
1453       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1454       G_READ_REGISTER,
1455       G_WRITE_REGISTER,
1456
1457       G_SADDO, G_SSUBO,
1458
1459        // TODO: Implement
1460       G_FMINIMUM, G_FMAXIMUM,
1461       G_FSHL
1462     }).lower();
1463
1464   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1465         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1466         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1467     .unsupported();
1468
1469   computeTables();
1470   verify(*ST.getInstrInfo());
1471 }
1472
1473 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1474                                          MachineInstr &MI) const {
1475   MachineIRBuilder &B = Helper.MIRBuilder;
1476   MachineRegisterInfo &MRI = *B.getMRI();
1477   GISelChangeObserver &Observer = Helper.Observer;
1478
1479   switch (MI.getOpcode()) {
1480   case TargetOpcode::G_ADDRSPACE_CAST:
1481     return legalizeAddrSpaceCast(MI, MRI, B);
1482   case TargetOpcode::G_FRINT:
1483     return legalizeFrint(MI, MRI, B);
1484   case TargetOpcode::G_FCEIL:
1485     return legalizeFceil(MI, MRI, B);
1486   case TargetOpcode::G_INTRINSIC_TRUNC:
1487     return legalizeIntrinsicTrunc(MI, MRI, B);
1488   case TargetOpcode::G_SITOFP:
1489     return legalizeITOFP(MI, MRI, B, true);
1490   case TargetOpcode::G_UITOFP:
1491     return legalizeITOFP(MI, MRI, B, false);
1492   case TargetOpcode::G_FPTOSI:
1493     return legalizeFPTOI(MI, MRI, B, true);
1494   case TargetOpcode::G_FPTOUI:
1495     return legalizeFPTOI(MI, MRI, B, false);
1496   case TargetOpcode::G_FMINNUM:
1497   case TargetOpcode::G_FMAXNUM:
1498   case TargetOpcode::G_FMINNUM_IEEE:
1499   case TargetOpcode::G_FMAXNUM_IEEE:
1500     return legalizeMinNumMaxNum(Helper, MI);
1501   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1502     return legalizeExtractVectorElt(MI, MRI, B);
1503   case TargetOpcode::G_INSERT_VECTOR_ELT:
1504     return legalizeInsertVectorElt(MI, MRI, B);
1505   case TargetOpcode::G_SHUFFLE_VECTOR:
1506     return legalizeShuffleVector(MI, MRI, B);
1507   case TargetOpcode::G_FSIN:
1508   case TargetOpcode::G_FCOS:
1509     return legalizeSinCos(MI, MRI, B);
1510   case TargetOpcode::G_GLOBAL_VALUE:
1511     return legalizeGlobalValue(MI, MRI, B);
1512   case TargetOpcode::G_LOAD:
1513     return legalizeLoad(MI, MRI, B, Observer);
1514   case TargetOpcode::G_FMAD:
1515     return legalizeFMad(MI, MRI, B);
1516   case TargetOpcode::G_FDIV:
1517     return legalizeFDIV(MI, MRI, B);
1518   case TargetOpcode::G_UDIV:
1519   case TargetOpcode::G_UREM:
1520     return legalizeUDIV_UREM(MI, MRI, B);
1521   case TargetOpcode::G_SDIV:
1522   case TargetOpcode::G_SREM:
1523     return legalizeSDIV_SREM(MI, MRI, B);
1524   case TargetOpcode::G_ATOMIC_CMPXCHG:
1525     return legalizeAtomicCmpXChg(MI, MRI, B);
1526   case TargetOpcode::G_FLOG:
1527     return legalizeFlog(MI, B, numbers::ln2f);
1528   case TargetOpcode::G_FLOG10:
1529     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1530   case TargetOpcode::G_FEXP:
1531     return legalizeFExp(MI, B);
1532   case TargetOpcode::G_FPOW:
1533     return legalizeFPow(MI, B);
1534   case TargetOpcode::G_FFLOOR:
1535     return legalizeFFloor(MI, MRI, B);
1536   case TargetOpcode::G_BUILD_VECTOR:
1537     return legalizeBuildVector(MI, MRI, B);
1538   default:
1539     return false;
1540   }
1541
1542   llvm_unreachable("expected switch to return");
1543 }
1544
1545 Register AMDGPULegalizerInfo::getSegmentAperture(
1546   unsigned AS,
1547   MachineRegisterInfo &MRI,
1548   MachineIRBuilder &B) const {
1549   MachineFunction &MF = B.getMF();
1550   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1551   const LLT S32 = LLT::scalar(32);
1552
1553   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1554
1555   if (ST.hasApertureRegs()) {
1556     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1557     // getreg.
1558     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1559         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1560         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1561     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1562         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1563         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1564     unsigned Encoding =
1565         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1566         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1567         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1568
1569     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1570
1571     B.buildInstr(AMDGPU::S_GETREG_B32)
1572       .addDef(GetReg)
1573       .addImm(Encoding);
1574     MRI.setType(GetReg, S32);
1575
1576     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1577     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1578   }
1579
1580   Register QueuePtr = MRI.createGenericVirtualRegister(
1581     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1582
1583   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1584   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1585     return Register();
1586
1587   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1588   // private_segment_aperture_base_hi.
1589   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1590
1591   // TODO: can we be smarter about machine pointer info?
1592   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1593   MachineMemOperand *MMO = MF.getMachineMemOperand(
1594       PtrInfo,
1595       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1596           MachineMemOperand::MOInvariant,
1597       4, commonAlignment(Align(64), StructOffset));
1598
1599   Register LoadAddr;
1600
1601   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1602   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1603 }
1604
1605 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1606   MachineInstr &MI, MachineRegisterInfo &MRI,
1607   MachineIRBuilder &B) const {
1608   MachineFunction &MF = B.getMF();
1609
1610   const LLT S32 = LLT::scalar(32);
1611   Register Dst = MI.getOperand(0).getReg();
1612   Register Src = MI.getOperand(1).getReg();
1613
1614   LLT DstTy = MRI.getType(Dst);
1615   LLT SrcTy = MRI.getType(Src);
1616   unsigned DestAS = DstTy.getAddressSpace();
1617   unsigned SrcAS = SrcTy.getAddressSpace();
1618
1619   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1620   // vector element.
1621   assert(!DstTy.isVector());
1622
1623   const AMDGPUTargetMachine &TM
1624     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1625
1626   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1627   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1628     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1629     return true;
1630   }
1631
1632   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1633     // Truncate.
1634     B.buildExtract(Dst, Src, 0);
1635     MI.eraseFromParent();
1636     return true;
1637   }
1638
1639   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1640     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1641     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1642
1643     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1644     // another. Merge operands are required to be the same type, but creating an
1645     // extra ptrtoint would be kind of pointless.
1646     auto HighAddr = B.buildConstant(
1647       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1648     B.buildMerge(Dst, {Src, HighAddr});
1649     MI.eraseFromParent();
1650     return true;
1651   }
1652
1653   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1654     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1655            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1656     unsigned NullVal = TM.getNullPointerValue(DestAS);
1657
1658     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1659     auto FlatNull = B.buildConstant(SrcTy, 0);
1660
1661     // Extract low 32-bits of the pointer.
1662     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1663
1664     auto CmpRes =
1665         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1666     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1667
1668     MI.eraseFromParent();
1669     return true;
1670   }
1671
1672   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1673     return false;
1674
1675   if (!ST.hasFlatAddressSpace())
1676     return false;
1677
1678   auto SegmentNull =
1679       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1680   auto FlatNull =
1681       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1682
1683   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1684   if (!ApertureReg.isValid())
1685     return false;
1686
1687   auto CmpRes =
1688       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1689
1690   // Coerce the type of the low half of the result so we can use merge_values.
1691   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1692
1693   // TODO: Should we allow mismatched types but matching sizes in merges to
1694   // avoid the ptrtoint?
1695   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1696   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1697
1698   MI.eraseFromParent();
1699   return true;
1700 }
1701
1702 bool AMDGPULegalizerInfo::legalizeFrint(
1703   MachineInstr &MI, MachineRegisterInfo &MRI,
1704   MachineIRBuilder &B) const {
1705   Register Src = MI.getOperand(1).getReg();
1706   LLT Ty = MRI.getType(Src);
1707   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1708
1709   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1710   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1711
1712   auto C1 = B.buildFConstant(Ty, C1Val);
1713   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1714
1715   // TODO: Should this propagate fast-math-flags?
1716   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1717   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1718
1719   auto C2 = B.buildFConstant(Ty, C2Val);
1720   auto Fabs = B.buildFAbs(Ty, Src);
1721
1722   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1723   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1724   return true;
1725 }
1726
1727 bool AMDGPULegalizerInfo::legalizeFceil(
1728   MachineInstr &MI, MachineRegisterInfo &MRI,
1729   MachineIRBuilder &B) const {
1730
1731   const LLT S1 = LLT::scalar(1);
1732   const LLT S64 = LLT::scalar(64);
1733
1734   Register Src = MI.getOperand(1).getReg();
1735   assert(MRI.getType(Src) == S64);
1736
1737   // result = trunc(src)
1738   // if (src > 0.0 && src != result)
1739   //   result += 1.0
1740
1741   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1742
1743   const auto Zero = B.buildFConstant(S64, 0.0);
1744   const auto One = B.buildFConstant(S64, 1.0);
1745   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1746   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1747   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1748   auto Add = B.buildSelect(S64, And, One, Zero);
1749
1750   // TODO: Should this propagate fast-math-flags?
1751   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1752   return true;
1753 }
1754
1755 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1756                                               MachineIRBuilder &B) {
1757   const unsigned FractBits = 52;
1758   const unsigned ExpBits = 11;
1759   LLT S32 = LLT::scalar(32);
1760
1761   auto Const0 = B.buildConstant(S32, FractBits - 32);
1762   auto Const1 = B.buildConstant(S32, ExpBits);
1763
1764   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1765     .addUse(Const0.getReg(0))
1766     .addUse(Const1.getReg(0));
1767
1768   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1769 }
1770
1771 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1772   MachineInstr &MI, MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774   const LLT S1 = LLT::scalar(1);
1775   const LLT S32 = LLT::scalar(32);
1776   const LLT S64 = LLT::scalar(64);
1777
1778   Register Src = MI.getOperand(1).getReg();
1779   assert(MRI.getType(Src) == S64);
1780
1781   // TODO: Should this use extract since the low half is unused?
1782   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1783   Register Hi = Unmerge.getReg(1);
1784
1785   // Extract the upper half, since this is where we will find the sign and
1786   // exponent.
1787   auto Exp = extractF64Exponent(Hi, B);
1788
1789   const unsigned FractBits = 52;
1790
1791   // Extract the sign bit.
1792   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1793   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1794
1795   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1796
1797   const auto Zero32 = B.buildConstant(S32, 0);
1798
1799   // Extend back to 64-bits.
1800   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1801
1802   auto Shr = B.buildAShr(S64, FractMask, Exp);
1803   auto Not = B.buildNot(S64, Shr);
1804   auto Tmp0 = B.buildAnd(S64, Src, Not);
1805   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1806
1807   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1808   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1809
1810   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1811   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1812   return true;
1813 }
1814
1815 bool AMDGPULegalizerInfo::legalizeITOFP(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B, bool Signed) const {
1818
1819   Register Dst = MI.getOperand(0).getReg();
1820   Register Src = MI.getOperand(1).getReg();
1821
1822   const LLT S64 = LLT::scalar(64);
1823   const LLT S32 = LLT::scalar(32);
1824
1825   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1826
1827   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1828
1829   auto CvtHi = Signed ?
1830     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1831     B.buildUITOFP(S64, Unmerge.getReg(1));
1832
1833   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1834
1835   auto ThirtyTwo = B.buildConstant(S32, 32);
1836   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1837     .addUse(CvtHi.getReg(0))
1838     .addUse(ThirtyTwo.getReg(0));
1839
1840   // TODO: Should this propagate fast-math-flags?
1841   B.buildFAdd(Dst, LdExp, CvtLo);
1842   MI.eraseFromParent();
1843   return true;
1844 }
1845
1846 // TODO: Copied from DAG implementation. Verify logic and document how this
1847 // actually works.
1848 bool AMDGPULegalizerInfo::legalizeFPTOI(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B, bool Signed) const {
1851
1852   Register Dst = MI.getOperand(0).getReg();
1853   Register Src = MI.getOperand(1).getReg();
1854
1855   const LLT S64 = LLT::scalar(64);
1856   const LLT S32 = LLT::scalar(32);
1857
1858   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1859
1860   unsigned Flags = MI.getFlags();
1861
1862   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1863   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1864   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1865
1866   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1867   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1868   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1869
1870   auto Hi = Signed ?
1871     B.buildFPTOSI(S32, FloorMul) :
1872     B.buildFPTOUI(S32, FloorMul);
1873   auto Lo = B.buildFPTOUI(S32, Fma);
1874
1875   B.buildMerge(Dst, { Lo, Hi });
1876   MI.eraseFromParent();
1877
1878   return true;
1879 }
1880
1881 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1882                                                MachineInstr &MI) const {
1883   MachineFunction &MF = Helper.MIRBuilder.getMF();
1884   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1885
1886   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1887                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1888
1889   // With ieee_mode disabled, the instructions have the correct behavior
1890   // already for G_FMINNUM/G_FMAXNUM
1891   if (!MFI->getMode().IEEE)
1892     return !IsIEEEOp;
1893
1894   if (IsIEEEOp)
1895     return true;
1896
1897   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1898 }
1899
1900 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1901   MachineInstr &MI, MachineRegisterInfo &MRI,
1902   MachineIRBuilder &B) const {
1903   // TODO: Should move some of this into LegalizerHelper.
1904
1905   // TODO: Promote dynamic indexing of s16 to s32
1906
1907   // FIXME: Artifact combiner probably should have replaced the truncated
1908   // constant before this, so we shouldn't need
1909   // getConstantVRegValWithLookThrough.
1910   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1911     MI.getOperand(2).getReg(), MRI);
1912   if (!IdxVal) // Dynamic case will be selected to register indexing.
1913     return true;
1914
1915   Register Dst = MI.getOperand(0).getReg();
1916   Register Vec = MI.getOperand(1).getReg();
1917
1918   LLT VecTy = MRI.getType(Vec);
1919   LLT EltTy = VecTy.getElementType();
1920   assert(EltTy == MRI.getType(Dst));
1921
1922   if (IdxVal->Value < VecTy.getNumElements())
1923     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1924   else
1925     B.buildUndef(Dst);
1926
1927   MI.eraseFromParent();
1928   return true;
1929 }
1930
1931 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1932   MachineInstr &MI, MachineRegisterInfo &MRI,
1933   MachineIRBuilder &B) const {
1934   // TODO: Should move some of this into LegalizerHelper.
1935
1936   // TODO: Promote dynamic indexing of s16 to s32
1937
1938   // FIXME: Artifact combiner probably should have replaced the truncated
1939   // constant before this, so we shouldn't need
1940   // getConstantVRegValWithLookThrough.
1941   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1942     MI.getOperand(3).getReg(), MRI);
1943   if (!IdxVal) // Dynamic case will be selected to register indexing.
1944     return true;
1945
1946   Register Dst = MI.getOperand(0).getReg();
1947   Register Vec = MI.getOperand(1).getReg();
1948   Register Ins = MI.getOperand(2).getReg();
1949
1950   LLT VecTy = MRI.getType(Vec);
1951   LLT EltTy = VecTy.getElementType();
1952   assert(EltTy == MRI.getType(Ins));
1953
1954   if (IdxVal->Value < VecTy.getNumElements())
1955     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1956   else
1957     B.buildUndef(Dst);
1958
1959   MI.eraseFromParent();
1960   return true;
1961 }
1962
1963 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1964   MachineInstr &MI, MachineRegisterInfo &MRI,
1965   MachineIRBuilder &B) const {
1966   const LLT V2S16 = LLT::vector(2, 16);
1967
1968   Register Dst = MI.getOperand(0).getReg();
1969   Register Src0 = MI.getOperand(1).getReg();
1970   LLT DstTy = MRI.getType(Dst);
1971   LLT SrcTy = MRI.getType(Src0);
1972
1973   if (SrcTy == V2S16 && DstTy == V2S16 &&
1974       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1975     return true;
1976
1977   MachineIRBuilder HelperBuilder(MI);
1978   GISelObserverWrapper DummyObserver;
1979   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1980   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1981 }
1982
1983 bool AMDGPULegalizerInfo::legalizeSinCos(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B) const {
1986
1987   Register DstReg = MI.getOperand(0).getReg();
1988   Register SrcReg = MI.getOperand(1).getReg();
1989   LLT Ty = MRI.getType(DstReg);
1990   unsigned Flags = MI.getFlags();
1991
1992   Register TrigVal;
1993   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1994   if (ST.hasTrigReducedRange()) {
1995     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1996     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1997       .addUse(MulVal.getReg(0))
1998       .setMIFlags(Flags).getReg(0);
1999   } else
2000     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2001
2002   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2003     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2004   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2005     .addUse(TrigVal)
2006     .setMIFlags(Flags);
2007   MI.eraseFromParent();
2008   return true;
2009 }
2010
2011 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2012                                                   MachineIRBuilder &B,
2013                                                   const GlobalValue *GV,
2014                                                   int64_t Offset,
2015                                                   unsigned GAFlags) const {
2016   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2017   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2018   // to the following code sequence:
2019   //
2020   // For constant address space:
2021   //   s_getpc_b64 s[0:1]
2022   //   s_add_u32 s0, s0, $symbol
2023   //   s_addc_u32 s1, s1, 0
2024   //
2025   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2026   //   a fixup or relocation is emitted to replace $symbol with a literal
2027   //   constant, which is a pc-relative offset from the encoding of the $symbol
2028   //   operand to the global variable.
2029   //
2030   // For global address space:
2031   //   s_getpc_b64 s[0:1]
2032   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2033   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2034   //
2035   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2036   //   fixups or relocations are emitted to replace $symbol@*@lo and
2037   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2038   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2039   //   operand to the global variable.
2040   //
2041   // What we want here is an offset from the value returned by s_getpc
2042   // (which is the address of the s_add_u32 instruction) to the global
2043   // variable, but since the encoding of $symbol starts 4 bytes after the start
2044   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2045   // small. This requires us to add 4 to the global variable offset in order to
2046   // compute the correct address.
2047
2048   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2049
2050   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2051     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2052
2053   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2054     .addDef(PCReg);
2055
2056   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2057   if (GAFlags == SIInstrInfo::MO_NONE)
2058     MIB.addImm(0);
2059   else
2060     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2061
2062   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2063
2064   if (PtrTy.getSizeInBits() == 32)
2065     B.buildExtract(DstReg, PCReg, 0);
2066   return true;
2067  }
2068
2069 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B) const {
2072   Register DstReg = MI.getOperand(0).getReg();
2073   LLT Ty = MRI.getType(DstReg);
2074   unsigned AS = Ty.getAddressSpace();
2075
2076   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2077   MachineFunction &MF = B.getMF();
2078   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2079
2080   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2081     if (!MFI->isEntryFunction()) {
2082       const Function &Fn = MF.getFunction();
2083       DiagnosticInfoUnsupported BadLDSDecl(
2084         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2085         DS_Warning);
2086       Fn.getContext().diagnose(BadLDSDecl);
2087
2088       // We currently don't have a way to correctly allocate LDS objects that
2089       // aren't directly associated with a kernel. We do force inlining of
2090       // functions that use local objects. However, if these dead functions are
2091       // not eliminated, we don't want a compile time error. Just emit a warning
2092       // and a trap, since there should be no callable path here.
2093       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2094       B.buildUndef(DstReg);
2095       MI.eraseFromParent();
2096       return true;
2097     }
2098
2099     // TODO: We could emit code to handle the initialization somewhere.
2100     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2101       const SITargetLowering *TLI = ST.getTargetLowering();
2102       if (!TLI->shouldUseLDSConstAddress(GV)) {
2103         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2104         return true; // Leave in place;
2105       }
2106
2107       B.buildConstant(
2108           DstReg,
2109           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2110       MI.eraseFromParent();
2111       return true;
2112     }
2113
2114     const Function &Fn = MF.getFunction();
2115     DiagnosticInfoUnsupported BadInit(
2116       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2117     Fn.getContext().diagnose(BadInit);
2118     return true;
2119   }
2120
2121   const SITargetLowering *TLI = ST.getTargetLowering();
2122
2123   if (TLI->shouldEmitFixup(GV)) {
2124     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2125     MI.eraseFromParent();
2126     return true;
2127   }
2128
2129   if (TLI->shouldEmitPCReloc(GV)) {
2130     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2131     MI.eraseFromParent();
2132     return true;
2133   }
2134
2135   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2136   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2137
2138   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2139       MachinePointerInfo::getGOT(MF),
2140       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2141           MachineMemOperand::MOInvariant,
2142       8 /*Size*/, Align(8));
2143
2144   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2145
2146   if (Ty.getSizeInBits() == 32) {
2147     // Truncate if this is a 32-bit constant adrdess.
2148     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2149     B.buildExtract(DstReg, Load, 0);
2150   } else
2151     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2152
2153   MI.eraseFromParent();
2154   return true;
2155 }
2156
2157 bool AMDGPULegalizerInfo::legalizeLoad(
2158   MachineInstr &MI, MachineRegisterInfo &MRI,
2159   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2160   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2161   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2162   Observer.changingInstr(MI);
2163   MI.getOperand(1).setReg(Cast.getReg(0));
2164   Observer.changedInstr(MI);
2165   return true;
2166 }
2167
2168 bool AMDGPULegalizerInfo::legalizeFMad(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2172   assert(Ty.isScalar());
2173
2174   MachineFunction &MF = B.getMF();
2175   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2176
2177   // TODO: Always legal with future ftz flag.
2178   // FIXME: Do we need just output?
2179   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2180     return true;
2181   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2182     return true;
2183
2184   MachineIRBuilder HelperBuilder(MI);
2185   GISelObserverWrapper DummyObserver;
2186   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2187   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2188 }
2189
2190 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2191   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2192   Register DstReg = MI.getOperand(0).getReg();
2193   Register PtrReg = MI.getOperand(1).getReg();
2194   Register CmpVal = MI.getOperand(2).getReg();
2195   Register NewVal = MI.getOperand(3).getReg();
2196
2197   assert(SITargetLowering::isFlatGlobalAddrSpace(
2198            MRI.getType(PtrReg).getAddressSpace()) &&
2199          "this should not have been custom lowered");
2200
2201   LLT ValTy = MRI.getType(CmpVal);
2202   LLT VecTy = LLT::vector(2, ValTy);
2203
2204   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2205
2206   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2207     .addDef(DstReg)
2208     .addUse(PtrReg)
2209     .addUse(PackedVal)
2210     .setMemRefs(MI.memoperands());
2211
2212   MI.eraseFromParent();
2213   return true;
2214 }
2215
2216 bool AMDGPULegalizerInfo::legalizeFlog(
2217   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2218   Register Dst = MI.getOperand(0).getReg();
2219   Register Src = MI.getOperand(1).getReg();
2220   LLT Ty = B.getMRI()->getType(Dst);
2221   unsigned Flags = MI.getFlags();
2222
2223   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2224   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2225
2226   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2227   MI.eraseFromParent();
2228   return true;
2229 }
2230
2231 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2232                                        MachineIRBuilder &B) const {
2233   Register Dst = MI.getOperand(0).getReg();
2234   Register Src = MI.getOperand(1).getReg();
2235   unsigned Flags = MI.getFlags();
2236   LLT Ty = B.getMRI()->getType(Dst);
2237
2238   auto K = B.buildFConstant(Ty, numbers::log2e);
2239   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2240   B.buildFExp2(Dst, Mul, Flags);
2241   MI.eraseFromParent();
2242   return true;
2243 }
2244
2245 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2246                                        MachineIRBuilder &B) const {
2247   Register Dst = MI.getOperand(0).getReg();
2248   Register Src0 = MI.getOperand(1).getReg();
2249   Register Src1 = MI.getOperand(2).getReg();
2250   unsigned Flags = MI.getFlags();
2251   LLT Ty = B.getMRI()->getType(Dst);
2252   const LLT S16 = LLT::scalar(16);
2253   const LLT S32 = LLT::scalar(32);
2254
2255   if (Ty == S32) {
2256     auto Log = B.buildFLog2(S32, Src0, Flags);
2257     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2258       .addUse(Log.getReg(0))
2259       .addUse(Src1)
2260       .setMIFlags(Flags);
2261     B.buildFExp2(Dst, Mul, Flags);
2262   } else if (Ty == S16) {
2263     // There's no f16 fmul_legacy, so we need to convert for it.
2264     auto Log = B.buildFLog2(S16, Src0, Flags);
2265     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2266     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2267     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2268       .addUse(Ext0.getReg(0))
2269       .addUse(Ext1.getReg(0))
2270       .setMIFlags(Flags);
2271
2272     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2273   } else
2274     return false;
2275
2276   MI.eraseFromParent();
2277   return true;
2278 }
2279
2280 // Find a source register, ignoring any possible source modifiers.
2281 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2282   Register ModSrc = OrigSrc;
2283   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2284     ModSrc = SrcFNeg->getOperand(1).getReg();
2285     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2286       ModSrc = SrcFAbs->getOperand(1).getReg();
2287   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2288     ModSrc = SrcFAbs->getOperand(1).getReg();
2289   return ModSrc;
2290 }
2291
2292 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2293                                          MachineRegisterInfo &MRI,
2294                                          MachineIRBuilder &B) const {
2295
2296   const LLT S1 = LLT::scalar(1);
2297   const LLT S64 = LLT::scalar(64);
2298   Register Dst = MI.getOperand(0).getReg();
2299   Register OrigSrc = MI.getOperand(1).getReg();
2300   unsigned Flags = MI.getFlags();
2301   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2302          "this should not have been custom lowered");
2303
2304   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2305   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2306   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2307   // V_FRACT bug is:
2308   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2309   //
2310   // Convert floor(x) to (x - fract(x))
2311
2312   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2313     .addUse(OrigSrc)
2314     .setMIFlags(Flags);
2315
2316   // Give source modifier matching some assistance before obscuring a foldable
2317   // pattern.
2318
2319   // TODO: We can avoid the neg on the fract? The input sign to fract
2320   // shouldn't matter?
2321   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2322
2323   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2324
2325   Register Min = MRI.createGenericVirtualRegister(S64);
2326
2327   // We don't need to concern ourselves with the snan handling difference, so
2328   // use the one which will directly select.
2329   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2330   if (MFI->getMode().IEEE)
2331     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2332   else
2333     B.buildFMinNum(Min, Fract, Const, Flags);
2334
2335   Register CorrectedFract = Min;
2336   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2337     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2338     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2339   }
2340
2341   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2342   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2343
2344   MI.eraseFromParent();
2345   return true;
2346 }
2347
2348 // Turn an illegal packed v2s16 build vector into bit operations.
2349 // TODO: This should probably be a bitcast action in LegalizerHelper.
2350 bool AMDGPULegalizerInfo::legalizeBuildVector(
2351   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2352   Register Dst = MI.getOperand(0).getReg();
2353   const LLT S32 = LLT::scalar(32);
2354   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2355
2356   Register Src0 = MI.getOperand(1).getReg();
2357   Register Src1 = MI.getOperand(2).getReg();
2358   assert(MRI.getType(Src0) == LLT::scalar(16));
2359
2360   auto Merge = B.buildMerge(S32, {Src0, Src1});
2361   B.buildBitcast(Dst, Merge);
2362
2363   MI.eraseFromParent();
2364   return true;
2365 }
2366
2367 // Return the use branch instruction, otherwise null if the usage is invalid.
2368 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2369                                        MachineRegisterInfo &MRI,
2370                                        MachineInstr *&Br,
2371                                        MachineBasicBlock *&UncondBrTarget) {
2372   Register CondDef = MI.getOperand(0).getReg();
2373   if (!MRI.hasOneNonDBGUse(CondDef))
2374     return nullptr;
2375
2376   MachineBasicBlock *Parent = MI.getParent();
2377   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2378   if (UseMI.getParent() != Parent ||
2379       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2380     return nullptr;
2381
2382   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2383   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2384   if (Next == Parent->end()) {
2385     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2386     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2387       return nullptr;
2388     UncondBrTarget = &*NextMBB;
2389   } else {
2390     if (Next->getOpcode() != AMDGPU::G_BR)
2391       return nullptr;
2392     Br = &*Next;
2393     UncondBrTarget = Br->getOperand(0).getMBB();
2394   }
2395
2396   return &UseMI;
2397 }
2398
2399 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2400                                                MachineRegisterInfo &MRI,
2401                                                Register LiveIn,
2402                                                Register PhyReg) const {
2403   assert(PhyReg.isPhysical() && "Physical register expected");
2404
2405   // Insert the live-in copy, if required, by defining destination virtual
2406   // register.
2407   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2408   if (!MRI.getVRegDef(LiveIn)) {
2409     // FIXME: Should have scoped insert pt
2410     MachineBasicBlock &OrigInsBB = B.getMBB();
2411     auto OrigInsPt = B.getInsertPt();
2412
2413     MachineBasicBlock &EntryMBB = B.getMF().front();
2414     EntryMBB.addLiveIn(PhyReg);
2415     B.setInsertPt(EntryMBB, EntryMBB.begin());
2416     B.buildCopy(LiveIn, PhyReg);
2417
2418     B.setInsertPt(OrigInsBB, OrigInsPt);
2419   }
2420
2421   return LiveIn;
2422 }
2423
2424 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2425                                                 MachineRegisterInfo &MRI,
2426                                                 Register PhyReg, LLT Ty,
2427                                                 bool InsertLiveInCopy) const {
2428   assert(PhyReg.isPhysical() && "Physical register expected");
2429
2430   // Get or create virtual live-in regester
2431   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2432   if (!LiveIn) {
2433     LiveIn = MRI.createGenericVirtualRegister(Ty);
2434     MRI.addLiveIn(PhyReg, LiveIn);
2435   }
2436
2437   // When the actual true copy required is from virtual register to physical
2438   // register (to be inserted later), live-in copy insertion from physical
2439   // to register virtual register is not required
2440   if (!InsertLiveInCopy)
2441     return LiveIn;
2442
2443   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444 }
2445
2446 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2447     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2448   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449   const ArgDescriptor *Arg;
2450   const TargetRegisterClass *RC;
2451   LLT ArgTy;
2452   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2453   if (!Arg) {
2454     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2455     return nullptr;
2456   }
2457   return Arg;
2458 }
2459
2460 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2461                                          const ArgDescriptor *Arg) const {
2462   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2463     return false; // TODO: Handle these
2464
2465   Register SrcReg = Arg->getRegister();
2466   assert(SrcReg.isPhysical() && "Physical register expected");
2467   assert(DstReg.isVirtual() && "Virtual register expected");
2468
2469   MachineRegisterInfo &MRI = *B.getMRI();
2470
2471   LLT Ty = MRI.getType(DstReg);
2472   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2473
2474   if (Arg->isMasked()) {
2475     // TODO: Should we try to emit this once in the entry block?
2476     const LLT S32 = LLT::scalar(32);
2477     const unsigned Mask = Arg->getMask();
2478     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2479
2480     Register AndMaskSrc = LiveIn;
2481
2482     if (Shift != 0) {
2483       auto ShiftAmt = B.buildConstant(S32, Shift);
2484       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2485     }
2486
2487     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2488   } else {
2489     B.buildCopy(DstReg, LiveIn);
2490   }
2491
2492   return true;
2493 }
2494
2495 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2496     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2497     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2498
2499   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2500   if (!Arg)
2501     return false;
2502
2503   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2504     return false;
2505
2506   MI.eraseFromParent();
2507   return true;
2508 }
2509
2510 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2511                                        MachineRegisterInfo &MRI,
2512                                        MachineIRBuilder &B) const {
2513   Register Dst = MI.getOperand(0).getReg();
2514   LLT DstTy = MRI.getType(Dst);
2515   LLT S16 = LLT::scalar(16);
2516   LLT S32 = LLT::scalar(32);
2517   LLT S64 = LLT::scalar(64);
2518
2519   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2520     return true;
2521
2522   if (DstTy == S16)
2523     return legalizeFDIV16(MI, MRI, B);
2524   if (DstTy == S32)
2525     return legalizeFDIV32(MI, MRI, B);
2526   if (DstTy == S64)
2527     return legalizeFDIV64(MI, MRI, B);
2528
2529   return false;
2530 }
2531
2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2533                                                   Register DstReg,
2534                                                   Register X,
2535                                                   Register Y,
2536                                                   bool IsDiv) const {
2537   const LLT S1 = LLT::scalar(1);
2538   const LLT S32 = LLT::scalar(32);
2539
2540   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2541   // algorithm used here.
2542
2543   // Initial estimate of inv(y).
2544   auto FloatY = B.buildUITOFP(S32, Y);
2545   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2546   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2547   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2548   auto Z = B.buildFPTOUI(S32, ScaledY);
2549
2550   // One round of UNR.
2551   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2552   auto NegYZ = B.buildMul(S32, NegY, Z);
2553   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2554
2555   // Quotient/remainder estimate.
2556   auto Q = B.buildUMulH(S32, X, Z);
2557   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2558
2559   // First quotient/remainder refinement.
2560   auto One = B.buildConstant(S32, 1);
2561   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2562   if (IsDiv)
2563     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2564   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2565
2566   // Second quotient/remainder refinement.
2567   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2568   if (IsDiv)
2569     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2570   else
2571     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2572 }
2573
2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2575                                               MachineRegisterInfo &MRI,
2576                                               MachineIRBuilder &B) const {
2577   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2578   Register DstReg = MI.getOperand(0).getReg();
2579   Register Num = MI.getOperand(1).getReg();
2580   Register Den = MI.getOperand(2).getReg();
2581   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2582   MI.eraseFromParent();
2583   return true;
2584 }
2585
2586 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2587 //
2588 // Return lo, hi of result
2589 //
2590 // %cvt.lo = G_UITOFP Val.lo
2591 // %cvt.hi = G_UITOFP Val.hi
2592 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2593 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2594 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2595 // %mul2 = G_FMUL %mul1, 2**(-32)
2596 // %trunc = G_INTRINSIC_TRUNC %mul2
2597 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2598 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2599 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2600                                                        Register Val) {
2601   const LLT S32 = LLT::scalar(32);
2602   auto Unmerge = B.buildUnmerge(S32, Val);
2603
2604   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2605   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2606
2607   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2608                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2609
2610   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2611   auto Mul1 =
2612       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2613
2614   // 2**(-32)
2615   auto Mul2 =
2616       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2617   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2618
2619   // -(2**32)
2620   auto Mad2 = B.buildFMAD(S32, Trunc,
2621                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2622
2623   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2624   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2625
2626   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2627 }
2628
2629 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2630                                                   Register DstReg,
2631                                                   Register Numer,
2632                                                   Register Denom,
2633                                                   bool IsDiv) const {
2634   const LLT S32 = LLT::scalar(32);
2635   const LLT S64 = LLT::scalar(64);
2636   const LLT S1 = LLT::scalar(1);
2637   Register RcpLo, RcpHi;
2638
2639   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2640
2641   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2642
2643   auto Zero64 = B.buildConstant(S64, 0);
2644   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2645
2646   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2647   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2648
2649   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2650   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2651   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2652
2653   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2654   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2655   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2656   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2657
2658   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2659   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2660   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2661   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2662   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2663
2664   auto Zero32 = B.buildConstant(S32, 0);
2665   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2666   auto Add2_HiC =
2667       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2668   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2669   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2670
2671   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2672   Register NumerLo = UnmergeNumer.getReg(0);
2673   Register NumerHi = UnmergeNumer.getReg(1);
2674
2675   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2676   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2677   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2678   Register Mul3_Lo = UnmergeMul3.getReg(0);
2679   Register Mul3_Hi = UnmergeMul3.getReg(1);
2680   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2681   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2682   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2683   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2684
2685   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2686   Register DenomLo = UnmergeDenom.getReg(0);
2687   Register DenomHi = UnmergeDenom.getReg(1);
2688
2689   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2690   auto C1 = B.buildSExt(S32, CmpHi);
2691
2692   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2693   auto C2 = B.buildSExt(S32, CmpLo);
2694
2695   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2696   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2697
2698   // TODO: Here and below portions of the code can be enclosed into if/endif.
2699   // Currently control flow is unconditional and we have 4 selects after
2700   // potential endif to substitute PHIs.
2701
2702   // if C3 != 0 ...
2703   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2704   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2705   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2706   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2707
2708   auto One64 = B.buildConstant(S64, 1);
2709   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2710
2711   auto C4 =
2712       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2713   auto C5 =
2714       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2715   auto C6 = B.buildSelect(
2716       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2717
2718   // if (C6 != 0)
2719   auto Add4 = B.buildAdd(S64, Add3, One64);
2720   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2721
2722   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2723   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2724   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2725
2726   // endif C6
2727   // endif C3
2728
2729   if (IsDiv) {
2730     auto Sel1 = B.buildSelect(
2731         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2732     B.buildSelect(DstReg,
2733                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2734   } else {
2735     auto Sel2 = B.buildSelect(
2736         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2737     B.buildSelect(DstReg,
2738                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2739   }
2740 }
2741
2742 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2743                                             MachineRegisterInfo &MRI,
2744                                             MachineIRBuilder &B) const {
2745   const LLT S64 = LLT::scalar(64);
2746   const LLT S32 = LLT::scalar(32);
2747   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2748   Register DstReg = MI.getOperand(0).getReg();
2749   Register Num = MI.getOperand(1).getReg();
2750   Register Den = MI.getOperand(2).getReg();
2751   LLT Ty = MRI.getType(DstReg);
2752
2753   if (Ty == S32)
2754     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2755   else if (Ty == S64)
2756     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2757   else
2758     return false;
2759
2760   MI.eraseFromParent();
2761   return true;
2762
2763 }
2764
2765 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2766                                             MachineRegisterInfo &MRI,
2767                                             MachineIRBuilder &B) const {
2768   const LLT S64 = LLT::scalar(64);
2769   const LLT S32 = LLT::scalar(32);
2770
2771   Register DstReg = MI.getOperand(0).getReg();
2772   const LLT Ty = MRI.getType(DstReg);
2773   if (Ty != S32 && Ty != S64)
2774     return false;
2775
2776   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2777
2778   Register LHS = MI.getOperand(1).getReg();
2779   Register RHS = MI.getOperand(2).getReg();
2780
2781   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2782   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2783   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2784
2785   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2786   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2787
2788   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2789   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2790
2791   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2792   if (Ty == S32)
2793     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2794   else
2795     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2796
2797   Register Sign;
2798   if (IsDiv)
2799     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2800   else
2801     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2802
2803   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2804   B.buildSub(DstReg, UDivRem, Sign);
2805
2806   MI.eraseFromParent();
2807   return true;
2808 }
2809
2810 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2811                                                  MachineRegisterInfo &MRI,
2812                                                  MachineIRBuilder &B) const {
2813   Register Res = MI.getOperand(0).getReg();
2814   Register LHS = MI.getOperand(1).getReg();
2815   Register RHS = MI.getOperand(2).getReg();
2816
2817   uint16_t Flags = MI.getFlags();
2818
2819   LLT ResTy = MRI.getType(Res);
2820   LLT S32 = LLT::scalar(32);
2821   LLT S64 = LLT::scalar(64);
2822
2823   const MachineFunction &MF = B.getMF();
2824   bool Unsafe =
2825     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2826
2827   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2828     return false;
2829
2830   if (!Unsafe && ResTy == S32 &&
2831       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2832     return false;
2833
2834   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2835     // 1 / x -> RCP(x)
2836     if (CLHS->isExactlyValue(1.0)) {
2837       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2838         .addUse(RHS)
2839         .setMIFlags(Flags);
2840
2841       MI.eraseFromParent();
2842       return true;
2843     }
2844
2845     // -1 / x -> RCP( FNEG(x) )
2846     if (CLHS->isExactlyValue(-1.0)) {
2847       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2848       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2849         .addUse(FNeg.getReg(0))
2850         .setMIFlags(Flags);
2851
2852       MI.eraseFromParent();
2853       return true;
2854     }
2855   }
2856
2857   // x / y -> x * (1.0 / y)
2858   if (Unsafe) {
2859     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2860       .addUse(RHS)
2861       .setMIFlags(Flags);
2862     B.buildFMul(Res, LHS, RCP, Flags);
2863
2864     MI.eraseFromParent();
2865     return true;
2866   }
2867
2868   return false;
2869 }
2870
2871 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2872                                          MachineRegisterInfo &MRI,
2873                                          MachineIRBuilder &B) const {
2874   Register Res = MI.getOperand(0).getReg();
2875   Register LHS = MI.getOperand(1).getReg();
2876   Register RHS = MI.getOperand(2).getReg();
2877
2878   uint16_t Flags = MI.getFlags();
2879
2880   LLT S16 = LLT::scalar(16);
2881   LLT S32 = LLT::scalar(32);
2882
2883   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2884   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2885
2886   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2887     .addUse(RHSExt.getReg(0))
2888     .setMIFlags(Flags);
2889
2890   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2891   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2892
2893   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2894     .addUse(RDst.getReg(0))
2895     .addUse(RHS)
2896     .addUse(LHS)
2897     .setMIFlags(Flags);
2898
2899   MI.eraseFromParent();
2900   return true;
2901 }
2902
2903 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2904 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2905 static void toggleSPDenormMode(bool Enable,
2906                                MachineIRBuilder &B,
2907                                const GCNSubtarget &ST,
2908                                AMDGPU::SIModeRegisterDefaults Mode) {
2909   // Set SP denorm mode to this value.
2910   unsigned SPDenormMode =
2911     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2912
2913   if (ST.hasDenormModeInst()) {
2914     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2915     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2916
2917     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2918     B.buildInstr(AMDGPU::S_DENORM_MODE)
2919       .addImm(NewDenormModeValue);
2920
2921   } else {
2922     // Select FP32 bit field in mode register.
2923     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2924                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2925                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2926
2927     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2928       .addImm(SPDenormMode)
2929       .addImm(SPDenormModeBitField);
2930   }
2931 }
2932
2933 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2934                                          MachineRegisterInfo &MRI,
2935                                          MachineIRBuilder &B) const {
2936   Register Res = MI.getOperand(0).getReg();
2937   Register LHS = MI.getOperand(1).getReg();
2938   Register RHS = MI.getOperand(2).getReg();
2939   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2940   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2941
2942   uint16_t Flags = MI.getFlags();
2943
2944   LLT S32 = LLT::scalar(32);
2945   LLT S1 = LLT::scalar(1);
2946
2947   auto One = B.buildFConstant(S32, 1.0f);
2948
2949   auto DenominatorScaled =
2950     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2951       .addUse(LHS)
2952       .addUse(RHS)
2953       .addImm(0)
2954       .setMIFlags(Flags);
2955   auto NumeratorScaled =
2956     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2957       .addUse(LHS)
2958       .addUse(RHS)
2959       .addImm(1)
2960       .setMIFlags(Flags);
2961
2962   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2963     .addUse(DenominatorScaled.getReg(0))
2964     .setMIFlags(Flags);
2965   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2966
2967   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2968   // aren't modeled as reading it.
2969   if (!Mode.allFP32Denormals())
2970     toggleSPDenormMode(true, B, ST, Mode);
2971
2972   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2973   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2974   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2975   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2976   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2977   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2978
2979   if (!Mode.allFP32Denormals())
2980     toggleSPDenormMode(false, B, ST, Mode);
2981
2982   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2983     .addUse(Fma4.getReg(0))
2984     .addUse(Fma1.getReg(0))
2985     .addUse(Fma3.getReg(0))
2986     .addUse(NumeratorScaled.getReg(1))
2987     .setMIFlags(Flags);
2988
2989   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2990     .addUse(Fmas.getReg(0))
2991     .addUse(RHS)
2992     .addUse(LHS)
2993     .setMIFlags(Flags);
2994
2995   MI.eraseFromParent();
2996   return true;
2997 }
2998
2999 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3000                                          MachineRegisterInfo &MRI,
3001                                          MachineIRBuilder &B) const {
3002   Register Res = MI.getOperand(0).getReg();
3003   Register LHS = MI.getOperand(1).getReg();
3004   Register RHS = MI.getOperand(2).getReg();
3005
3006   uint16_t Flags = MI.getFlags();
3007
3008   LLT S64 = LLT::scalar(64);
3009   LLT S1 = LLT::scalar(1);
3010
3011   auto One = B.buildFConstant(S64, 1.0);
3012
3013   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3014     .addUse(LHS)
3015     .addUse(RHS)
3016     .addImm(0)
3017     .setMIFlags(Flags);
3018
3019   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3020
3021   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3022     .addUse(DivScale0.getReg(0))
3023     .setMIFlags(Flags);
3024
3025   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3026   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3027   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3028
3029   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3030     .addUse(LHS)
3031     .addUse(RHS)
3032     .addImm(1)
3033     .setMIFlags(Flags);
3034
3035   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3036   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3037   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3038
3039   Register Scale;
3040   if (!ST.hasUsableDivScaleConditionOutput()) {
3041     // Workaround a hardware bug on SI where the condition output from div_scale
3042     // is not usable.
3043
3044     LLT S32 = LLT::scalar(32);
3045
3046     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3047     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3048     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3049     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3050
3051     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3052                               Scale1Unmerge.getReg(1));
3053     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3054                               Scale0Unmerge.getReg(1));
3055     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3056   } else {
3057     Scale = DivScale1.getReg(1);
3058   }
3059
3060   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3061     .addUse(Fma4.getReg(0))
3062     .addUse(Fma3.getReg(0))
3063     .addUse(Mul.getReg(0))
3064     .addUse(Scale)
3065     .setMIFlags(Flags);
3066
3067   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3068     .addUse(Fmas.getReg(0))
3069     .addUse(RHS)
3070     .addUse(LHS)
3071     .setMIFlags(Flags);
3072
3073   MI.eraseFromParent();
3074   return true;
3075 }
3076
3077 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3078                                                  MachineRegisterInfo &MRI,
3079                                                  MachineIRBuilder &B) const {
3080   Register Res = MI.getOperand(0).getReg();
3081   Register LHS = MI.getOperand(2).getReg();
3082   Register RHS = MI.getOperand(3).getReg();
3083   uint16_t Flags = MI.getFlags();
3084
3085   LLT S32 = LLT::scalar(32);
3086   LLT S1 = LLT::scalar(1);
3087
3088   auto Abs = B.buildFAbs(S32, RHS, Flags);
3089   const APFloat C0Val(1.0f);
3090
3091   auto C0 = B.buildConstant(S32, 0x6f800000);
3092   auto C1 = B.buildConstant(S32, 0x2f800000);
3093   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3094
3095   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3096   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3097
3098   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3099
3100   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3101     .addUse(Mul0.getReg(0))
3102     .setMIFlags(Flags);
3103
3104   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3105
3106   B.buildFMul(Res, Sel, Mul1, Flags);
3107
3108   MI.eraseFromParent();
3109   return true;
3110 }
3111
3112 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3113                                                  MachineRegisterInfo &MRI,
3114                                                  MachineIRBuilder &B) const {
3115   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3116   if (!MFI->isEntryFunction()) {
3117     return legalizePreloadedArgIntrin(MI, MRI, B,
3118                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3119   }
3120
3121   uint64_t Offset =
3122     ST.getTargetLowering()->getImplicitParameterOffset(
3123       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3124   Register DstReg = MI.getOperand(0).getReg();
3125   LLT DstTy = MRI.getType(DstReg);
3126   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3127
3128   const ArgDescriptor *Arg;
3129   const TargetRegisterClass *RC;
3130   LLT ArgTy;
3131   std::tie(Arg, RC, ArgTy) =
3132       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3133   if (!Arg)
3134     return false;
3135
3136   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3137   if (!loadInputValue(KernargPtrReg, B, Arg))
3138     return false;
3139
3140   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3141   MI.eraseFromParent();
3142   return true;
3143 }
3144
3145 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3146                                               MachineRegisterInfo &MRI,
3147                                               MachineIRBuilder &B,
3148                                               unsigned AddrSpace) const {
3149   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3150   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3151   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3152   MI.eraseFromParent();
3153   return true;
3154 }
3155
3156 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3157 // offset (the offset that is included in bounds checking and swizzling, to be
3158 // split between the instruction's voffset and immoffset fields) and soffset
3159 // (the offset that is excluded from bounds checking and swizzling, to go in
3160 // the instruction's soffset field).  This function takes the first kind of
3161 // offset and figures out how to split it between voffset and immoffset.
3162 std::tuple<Register, unsigned, unsigned>
3163 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3164                                         Register OrigOffset) const {
3165   const unsigned MaxImm = 4095;
3166   Register BaseReg;
3167   unsigned TotalConstOffset;
3168   MachineInstr *OffsetDef;
3169   const LLT S32 = LLT::scalar(32);
3170
3171   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3172     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3173
3174   unsigned ImmOffset = TotalConstOffset;
3175
3176   // If the immediate value is too big for the immoffset field, put the value
3177   // and -4096 into the immoffset field so that the value that is copied/added
3178   // for the voffset field is a multiple of 4096, and it stands more chance
3179   // of being CSEd with the copy/add for another similar load/store.
3180   // However, do not do that rounding down to a multiple of 4096 if that is a
3181   // negative number, as it appears to be illegal to have a negative offset
3182   // in the vgpr, even if adding the immediate offset makes it positive.
3183   unsigned Overflow = ImmOffset & ~MaxImm;
3184   ImmOffset -= Overflow;
3185   if ((int32_t)Overflow < 0) {
3186     Overflow += ImmOffset;
3187     ImmOffset = 0;
3188   }
3189
3190   if (Overflow != 0) {
3191     if (!BaseReg) {
3192       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3193     } else {
3194       auto OverflowVal = B.buildConstant(S32, Overflow);
3195       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3196     }
3197   }
3198
3199   if (!BaseReg)
3200     BaseReg = B.buildConstant(S32, 0).getReg(0);
3201
3202   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3203 }
3204
3205 /// Handle register layout difference for f16 images for some subtargets.
3206 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3207                                              MachineRegisterInfo &MRI,
3208                                              Register Reg) const {
3209   if (!ST.hasUnpackedD16VMem())
3210     return Reg;
3211
3212   const LLT S16 = LLT::scalar(16);
3213   const LLT S32 = LLT::scalar(32);
3214   LLT StoreVT = MRI.getType(Reg);
3215   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3216
3217   auto Unmerge = B.buildUnmerge(S16, Reg);
3218
3219   SmallVector<Register, 4> WideRegs;
3220   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3221     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3222
3223   int NumElts = StoreVT.getNumElements();
3224
3225   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3226 }
3227
3228 Register AMDGPULegalizerInfo::fixStoreSourceType(
3229   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3230   MachineRegisterInfo *MRI = B.getMRI();
3231   LLT Ty = MRI->getType(VData);
3232
3233   const LLT S16 = LLT::scalar(16);
3234
3235   // Fixup illegal register types for i8 stores.
3236   if (Ty == LLT::scalar(8) || Ty == S16) {
3237     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3238     return AnyExt;
3239   }
3240
3241   if (Ty.isVector()) {
3242     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3243       if (IsFormat)
3244         return handleD16VData(B, *MRI, VData);
3245     }
3246   }
3247
3248   return VData;
3249 }
3250
3251 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3252                                               MachineRegisterInfo &MRI,
3253                                               MachineIRBuilder &B,
3254                                               bool IsTyped,
3255                                               bool IsFormat) const {
3256   Register VData = MI.getOperand(1).getReg();
3257   LLT Ty = MRI.getType(VData);
3258   LLT EltTy = Ty.getScalarType();
3259   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3260   const LLT S32 = LLT::scalar(32);
3261
3262   VData = fixStoreSourceType(B, VData, IsFormat);
3263   Register RSrc = MI.getOperand(2).getReg();
3264
3265   MachineMemOperand *MMO = *MI.memoperands_begin();
3266   const int MemSize = MMO->getSize();
3267
3268   unsigned ImmOffset;
3269   unsigned TotalOffset;
3270
3271   // The typed intrinsics add an immediate after the registers.
3272   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3273
3274   // The struct intrinsic variants add one additional operand over raw.
3275   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3276   Register VIndex;
3277   int OpOffset = 0;
3278   if (HasVIndex) {
3279     VIndex = MI.getOperand(3).getReg();
3280     OpOffset = 1;
3281   }
3282
3283   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3284   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3285
3286   unsigned Format = 0;
3287   if (IsTyped) {
3288     Format = MI.getOperand(5 + OpOffset).getImm();
3289     ++OpOffset;
3290   }
3291
3292   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3293
3294   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3295   if (TotalOffset != 0)
3296     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3297
3298   unsigned Opc;
3299   if (IsTyped) {
3300     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3301                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3302   } else if (IsFormat) {
3303     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3304                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3305   } else {
3306     switch (MemSize) {
3307     case 1:
3308       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3309       break;
3310     case 2:
3311       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3312       break;
3313     default:
3314       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3315       break;
3316     }
3317   }
3318
3319   if (!VIndex)
3320     VIndex = B.buildConstant(S32, 0).getReg(0);
3321
3322   auto MIB = B.buildInstr(Opc)
3323     .addUse(VData)              // vdata
3324     .addUse(RSrc)               // rsrc
3325     .addUse(VIndex)             // vindex
3326     .addUse(VOffset)            // voffset
3327     .addUse(SOffset)            // soffset
3328     .addImm(ImmOffset);         // offset(imm)
3329
3330   if (IsTyped)
3331     MIB.addImm(Format);
3332
3333   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3334      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3335      .addMemOperand(MMO);
3336
3337   MI.eraseFromParent();
3338   return true;
3339 }
3340
3341 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3342                                              MachineRegisterInfo &MRI,
3343                                              MachineIRBuilder &B,
3344                                              bool IsFormat,
3345                                              bool IsTyped) const {
3346   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3347   MachineMemOperand *MMO = *MI.memoperands_begin();
3348   const int MemSize = MMO->getSize();
3349   const LLT S32 = LLT::scalar(32);
3350
3351   Register Dst = MI.getOperand(0).getReg();
3352   Register RSrc = MI.getOperand(2).getReg();
3353
3354   // The typed intrinsics add an immediate after the registers.
3355   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3356
3357   // The struct intrinsic variants add one additional operand over raw.
3358   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3359   Register VIndex;
3360   int OpOffset = 0;
3361   if (HasVIndex) {
3362     VIndex = MI.getOperand(3).getReg();
3363     OpOffset = 1;
3364   }
3365
3366   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3367   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3368
3369   unsigned Format = 0;
3370   if (IsTyped) {
3371     Format = MI.getOperand(5 + OpOffset).getImm();
3372     ++OpOffset;
3373   }
3374
3375   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3376   unsigned ImmOffset;
3377   unsigned TotalOffset;
3378
3379   LLT Ty = MRI.getType(Dst);
3380   LLT EltTy = Ty.getScalarType();
3381   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3382   const bool Unpacked = ST.hasUnpackedD16VMem();
3383
3384   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3385   if (TotalOffset != 0)
3386     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3387
3388   unsigned Opc;
3389
3390   if (IsTyped) {
3391     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3392                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3393   } else if (IsFormat) {
3394     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3395                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3396   } else {
3397     switch (MemSize) {
3398     case 1:
3399       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3400       break;
3401     case 2:
3402       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3403       break;
3404     default:
3405       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3406       break;
3407     }
3408   }
3409
3410   Register LoadDstReg;
3411
3412   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3413   LLT UnpackedTy = Ty.changeElementSize(32);
3414
3415   if (IsExtLoad)
3416     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3417   else if (Unpacked && IsD16 && Ty.isVector())
3418     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3419   else
3420     LoadDstReg = Dst;
3421
3422   if (!VIndex)
3423     VIndex = B.buildConstant(S32, 0).getReg(0);
3424
3425   auto MIB = B.buildInstr(Opc)
3426     .addDef(LoadDstReg)         // vdata
3427     .addUse(RSrc)               // rsrc
3428     .addUse(VIndex)             // vindex
3429     .addUse(VOffset)            // voffset
3430     .addUse(SOffset)            // soffset
3431     .addImm(ImmOffset);         // offset(imm)
3432
3433   if (IsTyped)
3434     MIB.addImm(Format);
3435
3436   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3437      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3438      .addMemOperand(MMO);
3439
3440   if (LoadDstReg != Dst) {
3441     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3442
3443     // Widen result for extending loads was widened.
3444     if (IsExtLoad)
3445       B.buildTrunc(Dst, LoadDstReg);
3446     else {
3447       // Repack to original 16-bit vector result
3448       // FIXME: G_TRUNC should work, but legalization currently fails
3449       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3450       SmallVector<Register, 4> Repack;
3451       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3452         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3453       B.buildMerge(Dst, Repack);
3454     }
3455   }
3456
3457   MI.eraseFromParent();
3458   return true;
3459 }
3460
3461 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3462                                                MachineIRBuilder &B,
3463                                                bool IsInc) const {
3464   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3465                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3466   B.buildInstr(Opc)
3467     .addDef(MI.getOperand(0).getReg())
3468     .addUse(MI.getOperand(2).getReg())
3469     .addUse(MI.getOperand(3).getReg())
3470     .cloneMemRefs(MI);
3471   MI.eraseFromParent();
3472   return true;
3473 }
3474
3475 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3476   switch (IntrID) {
3477   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3478   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3479     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3480   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3482     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3483   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3485     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3486   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3488     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3489   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3495   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3497     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3498   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3499   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3500     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3501   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3502   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3503     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3504   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3505   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3506     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3507   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3508   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3509     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3510   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3511   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3512     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3513   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3514   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3515     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3516   default:
3517     llvm_unreachable("unhandled atomic opcode");
3518   }
3519 }
3520
3521 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3522                                                MachineIRBuilder &B,
3523                                                Intrinsic::ID IID) const {
3524   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3525                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3526
3527   Register Dst = MI.getOperand(0).getReg();
3528   Register VData = MI.getOperand(2).getReg();
3529
3530   Register CmpVal;
3531   int OpOffset = 0;
3532
3533   if (IsCmpSwap) {
3534     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3535     ++OpOffset;
3536   }
3537
3538   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3539   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3540
3541   // The struct intrinsic variants add one additional operand over raw.
3542   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3543   Register VIndex;
3544   if (HasVIndex) {
3545     VIndex = MI.getOperand(4 + OpOffset).getReg();
3546     ++OpOffset;
3547   }
3548
3549   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3550   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3551   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3552
3553   MachineMemOperand *MMO = *MI.memoperands_begin();
3554
3555   unsigned ImmOffset;
3556   unsigned TotalOffset;
3557   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3558   if (TotalOffset != 0)
3559     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3560
3561   if (!VIndex)
3562     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3563
3564   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3565     .addDef(Dst)
3566     .addUse(VData); // vdata
3567
3568   if (IsCmpSwap)
3569     MIB.addReg(CmpVal);
3570
3571   MIB.addUse(RSrc)               // rsrc
3572      .addUse(VIndex)             // vindex
3573      .addUse(VOffset)            // voffset
3574      .addUse(SOffset)            // soffset
3575      .addImm(ImmOffset)          // offset(imm)
3576      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3577      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3578      .addMemOperand(MMO);
3579
3580   MI.eraseFromParent();
3581   return true;
3582 }
3583
3584 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3585 /// vector with s16 typed elements.
3586 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3587                                         SmallVectorImpl<Register> &PackedAddrs,
3588                                         int AddrIdx, int DimIdx, int EndIdx,
3589                                         int NumGradients) {
3590   const LLT S16 = LLT::scalar(16);
3591   const LLT V2S16 = LLT::vector(2, 16);
3592
3593   for (int I = AddrIdx; I < EndIdx; ++I) {
3594     MachineOperand &SrcOp = MI.getOperand(I);
3595     if (!SrcOp.isReg())
3596       continue; // _L to _LZ may have eliminated this.
3597
3598     Register AddrReg = SrcOp.getReg();
3599
3600     if (I < DimIdx) {
3601       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3602       PackedAddrs.push_back(AddrReg);
3603     } else {
3604       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3605       // derivatives dx/dh and dx/dv are packed with undef.
3606       if (((I + 1) >= EndIdx) ||
3607           ((NumGradients / 2) % 2 == 1 &&
3608            (I == DimIdx + (NumGradients / 2) - 1 ||
3609             I == DimIdx + NumGradients - 1)) ||
3610           // Check for _L to _LZ optimization
3611           !MI.getOperand(I + 1).isReg()) {
3612         PackedAddrs.push_back(
3613             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3614                 .getReg(0));
3615       } else {
3616         PackedAddrs.push_back(
3617             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3618                 .getReg(0));
3619         ++I;
3620       }
3621     }
3622   }
3623 }
3624
3625 /// Convert from separate vaddr components to a single vector address register,
3626 /// and replace the remaining operands with $noreg.
3627 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3628                                      int DimIdx, int NumVAddrs) {
3629   const LLT S32 = LLT::scalar(32);
3630
3631   SmallVector<Register, 8> AddrRegs;
3632   for (int I = 0; I != NumVAddrs; ++I) {
3633     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3634     if (SrcOp.isReg()) {
3635       AddrRegs.push_back(SrcOp.getReg());
3636       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3637     }
3638   }
3639
3640   int NumAddrRegs = AddrRegs.size();
3641   if (NumAddrRegs != 1) {
3642     // Round up to 8 elements for v5-v7
3643     // FIXME: Missing intermediate sized register classes and instructions.
3644     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3645       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3646       auto Undef = B.buildUndef(S32);
3647       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3648       NumAddrRegs = RoundedNumRegs;
3649     }
3650
3651     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3652     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3653   }
3654
3655   for (int I = 1; I != NumVAddrs; ++I) {
3656     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3657     if (SrcOp.isReg())
3658       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3659   }
3660 }
3661
3662 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3663 ///
3664 /// Depending on the subtarget, load/store with 16-bit element data need to be
3665 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3666 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3667 /// registers.
3668 ///
3669 /// We don't want to directly select image instructions just yet, but also want
3670 /// to exposes all register repacking to the legalizer/combiners. We also don't
3671 /// want a selected instrution entering RegBankSelect. In order to avoid
3672 /// defining a multitude of intermediate image instructions, directly hack on
3673 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
3674 /// now unnecessary arguments with $noreg.
3675 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3676     MachineInstr &MI, MachineIRBuilder &B,
3677     GISelChangeObserver &Observer,
3678     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3679
3680   const int NumDefs = MI.getNumExplicitDefs();
3681   bool IsTFE = NumDefs == 2;
3682   // We are only processing the operands of d16 image operations on subtargets
3683   // that use the unpacked register layout, or need to repack the TFE result.
3684
3685   // TODO: Do we need to guard against already legalized intrinsics?
3686   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3687     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3688
3689   MachineRegisterInfo *MRI = B.getMRI();
3690   const LLT S32 = LLT::scalar(32);
3691   const LLT S16 = LLT::scalar(16);
3692   const LLT V2S16 = LLT::vector(2, 16);
3693
3694   // Index of first address argument
3695   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3696
3697   int NumVAddrs, NumGradients;
3698   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3699   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3700     getDMaskIdx(BaseOpcode, NumDefs);
3701   unsigned DMask = 0;
3702
3703   // Check for 16 bit addresses and pack if true.
3704   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3705   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3706   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3707   const bool IsG16 = GradTy == S16;
3708   const bool IsA16 = AddrTy == S16;
3709
3710   int DMaskLanes = 0;
3711   if (!BaseOpcode->Atomic) {
3712     DMask = MI.getOperand(DMaskIdx).getImm();
3713     if (BaseOpcode->Gather4) {
3714       DMaskLanes = 4;
3715     } else if (DMask != 0) {
3716       DMaskLanes = countPopulation(DMask);
3717     } else if (!IsTFE && !BaseOpcode->Store) {
3718       // If dmask is 0, this is a no-op load. This can be eliminated.
3719       B.buildUndef(MI.getOperand(0));
3720       MI.eraseFromParent();
3721       return true;
3722     }
3723   }
3724
3725   Observer.changingInstr(MI);
3726   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3727
3728   unsigned NewOpcode = NumDefs == 0 ?
3729     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3730
3731   // Track that we legalized this
3732   MI.setDesc(B.getTII().get(NewOpcode));
3733
3734   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
3735   // dmask to be at least 1 otherwise the instruction will fail
3736   if (IsTFE && DMask == 0) {
3737     DMask = 0x1;
3738     DMaskLanes = 1;
3739     MI.getOperand(DMaskIdx).setImm(DMask);
3740   }
3741
3742   if (BaseOpcode->Atomic) {
3743     Register VData0 = MI.getOperand(2).getReg();
3744     LLT Ty = MRI->getType(VData0);
3745
3746     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3747     if (Ty.isVector())
3748       return false;
3749
3750     if (BaseOpcode->AtomicX2) {
3751       Register VData1 = MI.getOperand(3).getReg();
3752       // The two values are packed in one register.
3753       LLT PackedTy = LLT::vector(2, Ty);
3754       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3755       MI.getOperand(2).setReg(Concat.getReg(0));
3756       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3757     }
3758   }
3759
3760   int CorrectedNumVAddrs = NumVAddrs;
3761
3762   // Optimize _L to _LZ when _L is zero
3763   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3764         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3765     const ConstantFP *ConstantLod;
3766     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767
3768     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3769       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3770         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3771         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3772           LZMappingInfo->LZ, ImageDimIntr->Dim);
3773
3774         // The starting indexes should remain in the same place.
3775         --NumVAddrs;
3776         --CorrectedNumVAddrs;
3777
3778         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3779           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3780         MI.RemoveOperand(LodIdx);
3781       }
3782     }
3783   }
3784
3785   // Optimize _mip away, when 'lod' is zero
3786   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3787     int64_t ConstantLod;
3788     const int LodIdx = AddrIdx + NumVAddrs - 1;
3789
3790     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3791       if (ConstantLod == 0) {
3792         // TODO: Change intrinsic opcode and remove operand instead or replacing
3793         // it with 0, as the _L to _LZ handling is done above.
3794         MI.getOperand(LodIdx).ChangeToImmediate(0);
3795         --CorrectedNumVAddrs;
3796       }
3797     }
3798   }
3799
3800   // Rewrite the addressing register layout before doing anything else.
3801   if (IsA16 || IsG16) {
3802     if (IsA16) {
3803       // Target must support the feature and gradients need to be 16 bit too
3804       if (!ST.hasA16() || !IsG16)
3805         return false;
3806     } else if (!ST.hasG16())
3807       return false;
3808
3809     if (NumVAddrs > 1) {
3810       SmallVector<Register, 4> PackedRegs;
3811       // Don't compress addresses for G16
3812       const int PackEndIdx =
3813           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3814       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3815                                   PackEndIdx, NumGradients);
3816
3817       if (!IsA16) {
3818         // Add uncompressed address
3819         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3820           int AddrReg = MI.getOperand(I).getReg();
3821           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3822           PackedRegs.push_back(AddrReg);
3823         }
3824       }
3825
3826       // See also below in the non-a16 branch
3827       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3828
3829       if (!UseNSA && PackedRegs.size() > 1) {
3830         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3831         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3832         PackedRegs[0] = Concat.getReg(0);
3833         PackedRegs.resize(1);
3834       }
3835
3836       const int NumPacked = PackedRegs.size();
3837       for (int I = 0; I != NumVAddrs; ++I) {
3838         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3839         if (!SrcOp.isReg()) {
3840           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3841           continue;
3842         }
3843
3844         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3845
3846         if (I < NumPacked)
3847           SrcOp.setReg(PackedRegs[I]);
3848         else
3849           SrcOp.setReg(AMDGPU::NoRegister);
3850       }
3851     }
3852   } else {
3853     // If the register allocator cannot place the address registers contiguously
3854     // without introducing moves, then using the non-sequential address encoding
3855     // is always preferable, since it saves VALU instructions and is usually a
3856     // wash in terms of code size or even better.
3857     //
3858     // However, we currently have no way of hinting to the register allocator
3859     // that MIMG addresses should be placed contiguously when it is possible to
3860     // do so, so force non-NSA for the common 2-address case as a heuristic.
3861     //
3862     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3863     // allocation when possible.
3864     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3865
3866     if (!UseNSA && NumVAddrs > 1)
3867       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3868   }
3869
3870   int Flags = 0;
3871   if (IsA16)
3872     Flags |= 1;
3873   if (IsG16)
3874     Flags |= 2;
3875   MI.addOperand(MachineOperand::CreateImm(Flags));
3876
3877   if (BaseOpcode->Store) { // No TFE for stores?
3878     // TODO: Handle dmask trim
3879     Register VData = MI.getOperand(1).getReg();
3880     LLT Ty = MRI->getType(VData);
3881     if (!Ty.isVector() || Ty.getElementType() != S16)
3882       return true;
3883
3884     Register RepackedReg = handleD16VData(B, *MRI, VData);
3885     if (RepackedReg != VData) {
3886       MI.getOperand(1).setReg(RepackedReg);
3887     }
3888
3889     return true;
3890   }
3891
3892   Register DstReg = MI.getOperand(0).getReg();
3893   LLT Ty = MRI->getType(DstReg);
3894   const LLT EltTy = Ty.getScalarType();
3895   const bool IsD16 = Ty.getScalarType() == S16;
3896   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3897
3898   // Confirm that the return type is large enough for the dmask specified
3899   if (NumElts < DMaskLanes)
3900     return false;
3901
3902   if (NumElts > 4 || DMaskLanes > 4)
3903     return false;
3904
3905   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3906   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3907
3908   // The raw dword aligned data component of the load. The only legal cases
3909   // where this matters should be when using the packed D16 format, for
3910   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3911   LLT RoundedTy;
3912
3913   // S32 vector to to cover all data, plus TFE result element.
3914   LLT TFETy;
3915
3916   // Register type to use for each loaded component. Will be S32 or V2S16.
3917   LLT RegTy;
3918
3919   if (IsD16 && ST.hasUnpackedD16VMem()) {
3920     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3921     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3922     RegTy = S32;
3923   } else {
3924     unsigned EltSize = EltTy.getSizeInBits();
3925     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3926     unsigned RoundedSize = 32 * RoundedElts;
3927     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3928     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3929     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3930   }
3931
3932   // The return type does not need adjustment.
3933   // TODO: Should we change s16 case to s32 or <2 x s16>?
3934   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3935     return true;
3936
3937   Register Dst1Reg;
3938
3939   // Insert after the instruction.
3940   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3941
3942   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3943   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3944   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3945   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3946
3947   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3948
3949   MI.getOperand(0).setReg(NewResultReg);
3950
3951   // In the IR, TFE is supposed to be used with a 2 element struct return
3952   // type. The intruction really returns these two values in one contiguous
3953   // register, with one additional dword beyond the loaded data. Rewrite the
3954   // return type to use a single register result.
3955
3956   if (IsTFE) {
3957     Dst1Reg = MI.getOperand(1).getReg();
3958     if (MRI->getType(Dst1Reg) != S32)
3959       return false;
3960
3961     // TODO: Make sure the TFE operand bit is set.
3962     MI.RemoveOperand(1);
3963
3964     // Handle the easy case that requires no repack instructions.
3965     if (Ty == S32) {
3966       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3967       return true;
3968     }
3969   }
3970
3971   // Now figure out how to copy the new result register back into the old
3972   // result.
3973   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3974
3975   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3976
3977   if (ResultNumRegs == 1) {
3978     assert(!IsTFE);
3979     ResultRegs[0] = NewResultReg;
3980   } else {
3981     // We have to repack into a new vector of some kind.
3982     for (int I = 0; I != NumDataRegs; ++I)
3983       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3984     B.buildUnmerge(ResultRegs, NewResultReg);
3985
3986     // Drop the final TFE element to get the data part. The TFE result is
3987     // directly written to the right place already.
3988     if (IsTFE)
3989       ResultRegs.resize(NumDataRegs);
3990   }
3991
3992   // For an s16 scalar result, we form an s32 result with a truncate regardless
3993   // of packed vs. unpacked.
3994   if (IsD16 && !Ty.isVector()) {
3995     B.buildTrunc(DstReg, ResultRegs[0]);
3996     return true;
3997   }
3998
3999   // Avoid a build/concat_vector of 1 entry.
4000   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4001     B.buildBitcast(DstReg, ResultRegs[0]);
4002     return true;
4003   }
4004
4005   assert(Ty.isVector());
4006
4007   if (IsD16) {
4008     // For packed D16 results with TFE enabled, all the data components are
4009     // S32. Cast back to the expected type.
4010     //
4011     // TODO: We don't really need to use load s32 elements. We would only need one
4012     // cast for the TFE result if a multiple of v2s16 was used.
4013     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4014       for (Register &Reg : ResultRegs)
4015         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4016     } else if (ST.hasUnpackedD16VMem()) {
4017       for (Register &Reg : ResultRegs)
4018         Reg = B.buildTrunc(S16, Reg).getReg(0);
4019     }
4020   }
4021
4022   auto padWithUndef = [&](LLT Ty, int NumElts) {
4023     if (NumElts == 0)
4024       return;
4025     Register Undef = B.buildUndef(Ty).getReg(0);
4026     for (int I = 0; I != NumElts; ++I)
4027       ResultRegs.push_back(Undef);
4028   };
4029
4030   // Pad out any elements eliminated due to the dmask.
4031   LLT ResTy = MRI->getType(ResultRegs[0]);
4032   if (!ResTy.isVector()) {
4033     padWithUndef(ResTy, NumElts - ResultRegs.size());
4034     B.buildBuildVector(DstReg, ResultRegs);
4035     return true;
4036   }
4037
4038   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4039   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4040
4041   // Deal with the one annoying legal case.
4042   const LLT V3S16 = LLT::vector(3, 16);
4043   if (Ty == V3S16) {
4044     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4045     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4046     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4047     return true;
4048   }
4049
4050   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4051   B.buildConcatVectors(DstReg, ResultRegs);
4052   return true;
4053 }
4054
4055 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4056   MachineInstr &MI, MachineIRBuilder &B,
4057   GISelChangeObserver &Observer) const {
4058   Register Dst = MI.getOperand(0).getReg();
4059   LLT Ty = B.getMRI()->getType(Dst);
4060   unsigned Size = Ty.getSizeInBits();
4061   MachineFunction &MF = B.getMF();
4062
4063   Observer.changingInstr(MI);
4064
4065   // FIXME: We don't really need this intermediate instruction. The intrinsic
4066   // should be fixed to have a memory operand. Since it's readnone, we're not
4067   // allowed to add one.
4068   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4069   MI.RemoveOperand(1); // Remove intrinsic ID
4070
4071   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4072   // TODO: Should this use datalayout alignment?
4073   const unsigned MemSize = (Size + 7) / 8;
4074   const Align MemAlign(4);
4075   MachineMemOperand *MMO = MF.getMachineMemOperand(
4076       MachinePointerInfo(),
4077       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4078           MachineMemOperand::MOInvariant,
4079       MemSize, MemAlign);
4080   MI.addMemOperand(MF, MMO);
4081
4082   // There are no 96-bit result scalar loads, but widening to 128-bit should
4083   // always be legal. We may need to restore this to a 96-bit result if it turns
4084   // out this needs to be converted to a vector load during RegBankSelect.
4085   if (!isPowerOf2_32(Size)) {
4086     LegalizerHelper Helper(MF, *this, Observer, B);
4087
4088     if (Ty.isVector())
4089       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4090     else
4091       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4092   }
4093
4094   Observer.changedInstr(MI);
4095   return true;
4096 }
4097
4098 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4099                                                 MachineRegisterInfo &MRI,
4100                                                 MachineIRBuilder &B) const {
4101   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4102   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4103       !ST.isTrapHandlerEnabled()) {
4104     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4105   } else {
4106     // Pass queue pointer to trap handler as input, and insert trap instruction
4107     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4108     const ArgDescriptor *Arg =
4109         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4110     if (!Arg)
4111       return false;
4112     MachineRegisterInfo &MRI = *B.getMRI();
4113     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4114     Register LiveIn = getLiveInRegister(
4115         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4116         /*InsertLiveInCopy=*/false);
4117     if (!loadInputValue(LiveIn, B, Arg))
4118       return false;
4119     B.buildCopy(SGPR01, LiveIn);
4120     B.buildInstr(AMDGPU::S_TRAP)
4121         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4122         .addReg(SGPR01, RegState::Implicit);
4123   }
4124
4125   MI.eraseFromParent();
4126   return true;
4127 }
4128
4129 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4130     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4131   // Is non-HSA path or trap-handler disabled? then, report a warning
4132   // accordingly
4133   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4134       !ST.isTrapHandlerEnabled()) {
4135     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4136                                      "debugtrap handler not supported",
4137                                      MI.getDebugLoc(), DS_Warning);
4138     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4139     Ctx.diagnose(NoTrap);
4140   } else {
4141     // Insert debug-trap instruction
4142     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4143   }
4144
4145   MI.eraseFromParent();
4146   return true;
4147 }
4148
4149 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4150                                             MachineInstr &MI) const {
4151   MachineIRBuilder &B = Helper.MIRBuilder;
4152   MachineRegisterInfo &MRI = *B.getMRI();
4153
4154   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4155   auto IntrID = MI.getIntrinsicID();
4156   switch (IntrID) {
4157   case Intrinsic::amdgcn_if:
4158   case Intrinsic::amdgcn_else: {
4159     MachineInstr *Br = nullptr;
4160     MachineBasicBlock *UncondBrTarget = nullptr;
4161     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4162       const SIRegisterInfo *TRI
4163         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4164
4165       Register Def = MI.getOperand(1).getReg();
4166       Register Use = MI.getOperand(3).getReg();
4167
4168       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4169       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4170       if (IntrID == Intrinsic::amdgcn_if) {
4171         B.buildInstr(AMDGPU::SI_IF)
4172           .addDef(Def)
4173           .addUse(Use)
4174           .addMBB(UncondBrTarget);
4175       } else {
4176         B.buildInstr(AMDGPU::SI_ELSE)
4177           .addDef(Def)
4178           .addUse(Use)
4179           .addMBB(UncondBrTarget)
4180           .addImm(0);
4181       }
4182
4183       if (Br) {
4184         Br->getOperand(0).setMBB(CondBrTarget);
4185       } else {
4186         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4187         // since we're swapping branch targets it needs to be reinserted.
4188         // FIXME: IRTranslator should probably not do this
4189         B.buildBr(*CondBrTarget);
4190       }
4191
4192       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4193       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4194       MI.eraseFromParent();
4195       BrCond->eraseFromParent();
4196       return true;
4197     }
4198
4199     return false;
4200   }
4201   case Intrinsic::amdgcn_loop: {
4202     MachineInstr *Br = nullptr;
4203     MachineBasicBlock *UncondBrTarget = nullptr;
4204     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4205       const SIRegisterInfo *TRI
4206         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4207
4208       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4209       Register Reg = MI.getOperand(2).getReg();
4210
4211       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4212       B.buildInstr(AMDGPU::SI_LOOP)
4213         .addUse(Reg)
4214         .addMBB(UncondBrTarget);
4215
4216       if (Br)
4217         Br->getOperand(0).setMBB(CondBrTarget);
4218       else
4219         B.buildBr(*CondBrTarget);
4220
4221       MI.eraseFromParent();
4222       BrCond->eraseFromParent();
4223       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4224       return true;
4225     }
4226
4227     return false;
4228   }
4229   case Intrinsic::amdgcn_kernarg_segment_ptr:
4230     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4231       // This only makes sense to call in a kernel, so just lower to null.
4232       B.buildConstant(MI.getOperand(0).getReg(), 0);
4233       MI.eraseFromParent();
4234       return true;
4235     }
4236
4237     return legalizePreloadedArgIntrin(
4238       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4239   case Intrinsic::amdgcn_implicitarg_ptr:
4240     return legalizeImplicitArgPtr(MI, MRI, B);
4241   case Intrinsic::amdgcn_workitem_id_x:
4242     return legalizePreloadedArgIntrin(MI, MRI, B,
4243                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4244   case Intrinsic::amdgcn_workitem_id_y:
4245     return legalizePreloadedArgIntrin(MI, MRI, B,
4246                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4247   case Intrinsic::amdgcn_workitem_id_z:
4248     return legalizePreloadedArgIntrin(MI, MRI, B,
4249                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4250   case Intrinsic::amdgcn_workgroup_id_x:
4251     return legalizePreloadedArgIntrin(MI, MRI, B,
4252                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4253   case Intrinsic::amdgcn_workgroup_id_y:
4254     return legalizePreloadedArgIntrin(MI, MRI, B,
4255                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4256   case Intrinsic::amdgcn_workgroup_id_z:
4257     return legalizePreloadedArgIntrin(MI, MRI, B,
4258                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4259   case Intrinsic::amdgcn_dispatch_ptr:
4260     return legalizePreloadedArgIntrin(MI, MRI, B,
4261                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4262   case Intrinsic::amdgcn_queue_ptr:
4263     return legalizePreloadedArgIntrin(MI, MRI, B,
4264                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4265   case Intrinsic::amdgcn_implicit_buffer_ptr:
4266     return legalizePreloadedArgIntrin(
4267       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4268   case Intrinsic::amdgcn_dispatch_id:
4269     return legalizePreloadedArgIntrin(MI, MRI, B,
4270                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4271   case Intrinsic::amdgcn_fdiv_fast:
4272     return legalizeFDIVFastIntrin(MI, MRI, B);
4273   case Intrinsic::amdgcn_is_shared:
4274     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4275   case Intrinsic::amdgcn_is_private:
4276     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4277   case Intrinsic::amdgcn_wavefrontsize: {
4278     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4279     MI.eraseFromParent();
4280     return true;
4281   }
4282   case Intrinsic::amdgcn_s_buffer_load:
4283     return legalizeSBufferLoad(MI, B, Helper.Observer);
4284   case Intrinsic::amdgcn_raw_buffer_store:
4285   case Intrinsic::amdgcn_struct_buffer_store:
4286     return legalizeBufferStore(MI, MRI, B, false, false);
4287   case Intrinsic::amdgcn_raw_buffer_store_format:
4288   case Intrinsic::amdgcn_struct_buffer_store_format:
4289     return legalizeBufferStore(MI, MRI, B, false, true);
4290   case Intrinsic::amdgcn_raw_tbuffer_store:
4291   case Intrinsic::amdgcn_struct_tbuffer_store:
4292     return legalizeBufferStore(MI, MRI, B, true, true);
4293   case Intrinsic::amdgcn_raw_buffer_load:
4294   case Intrinsic::amdgcn_struct_buffer_load:
4295     return legalizeBufferLoad(MI, MRI, B, false, false);
4296   case Intrinsic::amdgcn_raw_buffer_load_format:
4297   case Intrinsic::amdgcn_struct_buffer_load_format:
4298     return legalizeBufferLoad(MI, MRI, B, true, false);
4299   case Intrinsic::amdgcn_raw_tbuffer_load:
4300   case Intrinsic::amdgcn_struct_tbuffer_load:
4301     return legalizeBufferLoad(MI, MRI, B, true, true);
4302   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4303   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4304   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4305   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4306   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4307   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4308   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4309   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4310   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4311   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4312   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4313   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4314   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4315   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4316   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4317   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4318   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4319   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4320   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4321   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4322   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4323   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4324   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4325   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4326   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4327   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4328     return legalizeBufferAtomic(MI, B, IntrID);
4329   case Intrinsic::amdgcn_atomic_inc:
4330     return legalizeAtomicIncDec(MI, B, true);
4331   case Intrinsic::amdgcn_atomic_dec:
4332     return legalizeAtomicIncDec(MI, B, false);
4333   case Intrinsic::trap:
4334     return legalizeTrapIntrinsic(MI, MRI, B);
4335   case Intrinsic::debugtrap:
4336     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4337   default: {
4338     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4339             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4340       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4341     return true;
4342   }
4343   }
4344
4345   return true;
4346 }