contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

   1 //===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief SI Implementation of TargetInstrInfo.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "SIInstrInfo.h"
  16 #include "AMDGPUTargetMachine.h"
  17 #include "GCNHazardRecognizer.h"
  18 #include "SIDefines.h"
  19 #include "SIMachineFunctionInfo.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/ScheduleDAG.h"
  24 #include "llvm/IR/DiagnosticInfo.h"
  25 #include "llvm/IR/Function.h"
  26 #include "llvm/CodeGen/RegisterScavenging.h"
  27 #include "llvm/MC/MCInstrDesc.h"
  28 #include "llvm/Support/Debug.h"
  29
  30 using namespace llvm;
  31
  32 // Must be at least 4 to be able to branch over minimum unconditional branch
  33 // code. This is only for making it possible to write reasonably small tests for
  34 // long branches.
  35 static cl::opt<unsigned>
  36 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
  37                  cl::desc("Restrict range of branch instructions (DEBUG)"));
  38
  39 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
  40   : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
  41
  42 //===----------------------------------------------------------------------===//
  43 // TargetInstrInfo callbacks
  44 //===----------------------------------------------------------------------===//
  45
  46 static unsigned getNumOperandsNoGlue(SDNode *Node) {
  47   unsigned N = Node->getNumOperands();
  48   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
  49     --N;
  50   return N;
  51 }
  52
  53 static SDValue findChainOperand(SDNode *Load) {
  54   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
  55   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  56   return LastOp;
  57 }
  58
  59 /// \brief Returns true if both nodes have the same value for the given
  60 ///        operand \p Op, or if both nodes do not have this operand.
  61 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  62   unsigned Opc0 = N0->getMachineOpcode();
  63   unsigned Opc1 = N1->getMachineOpcode();
  64
  65   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  66   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
  67
  68   if (Op0Idx == -1 && Op1Idx == -1)
  69     return true;
  70
  71
  72   if ((Op0Idx == -1 && Op1Idx != -1) ||
  73       (Op1Idx == -1 && Op0Idx != -1))
  74     return false;
  75
  76   // getNamedOperandIdx returns the index for the MachineInstr's operands,
  77   // which includes the result as the first operand. We are indexing into the
  78   // MachineSDNode's operands, so we need to skip the result operand to get
  79   // the real index.
  80   --Op0Idx;
  81   --Op1Idx;
  82
  83   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
  84 }
  85
  86 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
  87                                                     AliasAnalysis *AA) const {
  88   // TODO: The generic check fails for VALU instructions that should be
  89   // rematerializable due to implicit reads of exec. We really want all of the
  90   // generic logic for this except for this.
  91   switch (MI.getOpcode()) {
  92   case AMDGPU::V_MOV_B32_e32:
  93   case AMDGPU::V_MOV_B32_e64:
  94   case AMDGPU::V_MOV_B64_PSEUDO:
  95     return true;
  96   default:
  97     return false;
  98   }
  99 }
 100
 101 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
 102                                           int64_t &Offset0,
 103                                           int64_t &Offset1) const {
 104   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
 105     return false;
 106
 107   unsigned Opc0 = Load0->getMachineOpcode();
 108   unsigned Opc1 = Load1->getMachineOpcode();
 109
 110   // Make sure both are actually loads.
 111   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
 112     return false;
 113
 114   if (isDS(Opc0) && isDS(Opc1)) {
 115
 116     // FIXME: Handle this case:
 117     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
 118       return false;
 119
 120     // Check base reg.
 121     if (Load0->getOperand(1) != Load1->getOperand(1))
 122       return false;
 123
 124     // Check chain.
 125     if (findChainOperand(Load0) != findChainOperand(Load1))
 126       return false;
 127
 128     // Skip read2 / write2 variants for simplicity.
 129     // TODO: We should report true if the used offsets are adjacent (excluded
 130     // st64 versions).
 131     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
 132         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
 133       return false;
 134
 135     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
 136     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
 137     return true;
 138   }
 139
 140   if (isSMRD(Opc0) && isSMRD(Opc1)) {
 141     // Skip time and cache invalidation instructions.
 142     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
 143         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
 144       return false;
 145
 146     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
 147
 148     // Check base reg.
 149     if (Load0->getOperand(0) != Load1->getOperand(0))
 150       return false;
 151
 152     const ConstantSDNode *Load0Offset =
 153         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
 154     const ConstantSDNode *Load1Offset =
 155         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
 156
 157     if (!Load0Offset || !Load1Offset)
 158       return false;
 159
 160     // Check chain.
 161     if (findChainOperand(Load0) != findChainOperand(Load1))
 162       return false;
 163
 164     Offset0 = Load0Offset->getZExtValue();
 165     Offset1 = Load1Offset->getZExtValue();
 166     return true;
 167   }
 168
 169   // MUBUF and MTBUF can access the same addresses.
 170   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
 171
 172     // MUBUF and MTBUF have vaddr at different indices.
 173     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
 174         findChainOperand(Load0) != findChainOperand(Load1) ||
 175         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
 176         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
 177       return false;
 178
 179     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
 180     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
 181
 182     if (OffIdx0 == -1 || OffIdx1 == -1)
 183       return false;
 184
 185     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
 186     // inlcude the output in the operand list, but SDNodes don't, we need to
 187     // subtract the index by one.
 188     --OffIdx0;
 189     --OffIdx1;
 190
 191     SDValue Off0 = Load0->getOperand(OffIdx0);
 192     SDValue Off1 = Load1->getOperand(OffIdx1);
 193
 194     // The offset might be a FrameIndexSDNode.
 195     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
 196       return false;
 197
 198     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
 199     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
 200     return true;
 201   }
 202
 203   return false;
 204 }
 205
 206 static bool isStride64(unsigned Opc) {
 207   switch (Opc) {
 208   case AMDGPU::DS_READ2ST64_B32:
 209   case AMDGPU::DS_READ2ST64_B64:
 210   case AMDGPU::DS_WRITE2ST64_B32:
 211   case AMDGPU::DS_WRITE2ST64_B64:
 212     return true;
 213   default:
 214     return false;
 215   }
 216 }
 217
 218 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
 219                                         int64_t &Offset,
 220                                         const TargetRegisterInfo *TRI) const {
 221   unsigned Opc = LdSt.getOpcode();
 222
 223   if (isDS(LdSt)) {
 224     const MachineOperand *OffsetImm =
 225         getNamedOperand(LdSt, AMDGPU::OpName::offset);
 226     if (OffsetImm) {
 227       // Normal, single offset LDS instruction.
 228       const MachineOperand *AddrReg =
 229           getNamedOperand(LdSt, AMDGPU::OpName::addr);
 230
 231       BaseReg = AddrReg->getReg();
 232       Offset = OffsetImm->getImm();
 233       return true;
 234     }
 235
 236     // The 2 offset instructions use offset0 and offset1 instead. We can treat
 237     // these as a load with a single offset if the 2 offsets are consecutive. We
 238     // will use this for some partially aligned loads.
 239     const MachineOperand *Offset0Imm =
 240         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
 241     const MachineOperand *Offset1Imm =
 242         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
 243
 244     uint8_t Offset0 = Offset0Imm->getImm();
 245     uint8_t Offset1 = Offset1Imm->getImm();
 246
 247     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
 248       // Each of these offsets is in element sized units, so we need to convert
 249       // to bytes of the individual reads.
 250
 251       unsigned EltSize;
 252       if (LdSt.mayLoad())
 253         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
 254       else {
 255         assert(LdSt.mayStore());
 256         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
 257         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
 258       }
 259
 260       if (isStride64(Opc))
 261         EltSize *= 64;
 262
 263       const MachineOperand *AddrReg =
 264           getNamedOperand(LdSt, AMDGPU::OpName::addr);
 265       BaseReg = AddrReg->getReg();
 266       Offset = EltSize * Offset0;
 267       return true;
 268     }
 269
 270     return false;
 271   }
 272
 273   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
 274     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
 275     if (SOffset && SOffset->isReg())
 276       return false;
 277
 278     const MachineOperand *AddrReg =
 279         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
 280     if (!AddrReg)
 281       return false;
 282
 283     const MachineOperand *OffsetImm =
 284         getNamedOperand(LdSt, AMDGPU::OpName::offset);
 285     BaseReg = AddrReg->getReg();
 286     Offset = OffsetImm->getImm();
 287
 288     if (SOffset) // soffset can be an inline immediate.
 289       Offset += SOffset->getImm();
 290
 291     return true;
 292   }
 293
 294   if (isSMRD(LdSt)) {
 295     const MachineOperand *OffsetImm =
 296         getNamedOperand(LdSt, AMDGPU::OpName::offset);
 297     if (!OffsetImm)
 298       return false;
 299
 300     const MachineOperand *SBaseReg =
 301         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
 302     BaseReg = SBaseReg->getReg();
 303     Offset = OffsetImm->getImm();
 304     return true;
 305   }
 306
 307   if (isFLAT(LdSt)) {
 308     const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
 309     BaseReg = AddrReg->getReg();
 310     Offset = 0;
 311     return true;
 312   }
 313
 314   return false;
 315 }
 316
 317 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
 318                                       MachineInstr &SecondLdSt,
 319                                       unsigned NumLoads) const {
 320   const MachineOperand *FirstDst = nullptr;
 321   const MachineOperand *SecondDst = nullptr;
 322
 323   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
 324       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
 325       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
 326     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
 327     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
 328   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
 329     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
 330     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
 331   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
 332     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
 333     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
 334   }
 335
 336   if (!FirstDst || !SecondDst)
 337     return false;
 338
 339   // Try to limit clustering based on the total number of bytes loaded
 340   // rather than the number of instructions.  This is done to help reduce
 341   // register pressure.  The method used is somewhat inexact, though,
 342   // because it assumes that all loads in the cluster will load the
 343   // same number of bytes as FirstLdSt.
 344
 345   // The unit of this value is bytes.
 346   // FIXME: This needs finer tuning.
 347   unsigned LoadClusterThreshold = 16;
 348
 349   const MachineRegisterInfo &MRI =
 350       FirstLdSt.getParent()->getParent()->getRegInfo();
 351   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
 352
 353   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
 354 }
 355
 356 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
 357                               MachineBasicBlock::iterator MI,
 358                               const DebugLoc &DL, unsigned DestReg,
 359                               unsigned SrcReg, bool KillSrc) {
 360   MachineFunction *MF = MBB.getParent();
 361   DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
 362                                         "illegal SGPR to VGPR copy",
 363                                         DL, DS_Error);
 364   LLVMContext &C = MF->getFunction()->getContext();
 365   C.diagnose(IllegalCopy);
 366
 367   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
 368     .addReg(SrcReg, getKillRegState(KillSrc));
 369 }
 370
 371 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 372                               MachineBasicBlock::iterator MI,
 373                               const DebugLoc &DL, unsigned DestReg,
 374                               unsigned SrcReg, bool KillSrc) const {
 375   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
 376
 377   if (RC == &AMDGPU::VGPR_32RegClass) {
 378     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
 379            AMDGPU::SReg_32RegClass.contains(SrcReg));
 380     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
 381       .addReg(SrcReg, getKillRegState(KillSrc));
 382     return;
 383   }
 384
 385   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
 386       RC == &AMDGPU::SReg_32RegClass) {
 387     if (SrcReg == AMDGPU::SCC) {
 388       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
 389           .addImm(-1)
 390           .addImm(0);
 391       return;
 392     }
 393
 394     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
 395       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
 396       return;
 397     }
 398
 399     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
 400             .addReg(SrcReg, getKillRegState(KillSrc));
 401     return;
 402   }
 403
 404   if (RC == &AMDGPU::SReg_64RegClass) {
 405     if (DestReg == AMDGPU::VCC) {
 406       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
 407         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
 408           .addReg(SrcReg, getKillRegState(KillSrc));
 409       } else {
 410         // FIXME: Hack until VReg_1 removed.
 411         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
 412         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
 413           .addImm(0)
 414           .addReg(SrcReg, getKillRegState(KillSrc));
 415       }
 416
 417       return;
 418     }
 419
 420     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
 421       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
 422       return;
 423     }
 424
 425     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
 426             .addReg(SrcReg, getKillRegState(KillSrc));
 427     return;
 428   }
 429
 430   if (DestReg == AMDGPU::SCC) {
 431     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
 432     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
 433       .addReg(SrcReg, getKillRegState(KillSrc))
 434       .addImm(0);
 435     return;
 436   }
 437
 438   unsigned EltSize = 4;
 439   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
 440   if (RI.isSGPRClass(RC)) {
 441     if (RI.getRegSizeInBits(*RC) > 32) {
 442       Opcode =  AMDGPU::S_MOV_B64;
 443       EltSize = 8;
 444     } else {
 445       Opcode = AMDGPU::S_MOV_B32;
 446       EltSize = 4;
 447     }
 448
 449     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
 450       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
 451       return;
 452     }
 453   }
 454
 455
 456   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
 457   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
 458
 459   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
 460     unsigned SubIdx;
 461     if (Forward)
 462       SubIdx = SubIndices[Idx];
 463     else
 464       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
 465
 466     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
 467       get(Opcode), RI.getSubReg(DestReg, SubIdx));
 468
 469     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
 470
 471     if (Idx == SubIndices.size() - 1)
 472       Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
 473
 474     if (Idx == 0)
 475       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
 476
 477     Builder.addReg(SrcReg, RegState::Implicit);
 478   }
 479 }
 480
 481 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
 482   int NewOpc;
 483
 484   // Try to map original to commuted opcode
 485   NewOpc = AMDGPU::getCommuteRev(Opcode);
 486   if (NewOpc != -1)
 487     // Check if the commuted (REV) opcode exists on the target.
 488     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
 489
 490   // Try to map commuted to original opcode
 491   NewOpc = AMDGPU::getCommuteOrig(Opcode);
 492   if (NewOpc != -1)
 493     // Check if the original (non-REV) opcode exists on the target.
 494     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
 495
 496   return Opcode;
 497 }
 498
 499 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
 500
 501   if (RI.getRegSizeInBits(*DstRC) == 32) {
 502     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
 503   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
 504     return AMDGPU::S_MOV_B64;
 505   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
 506     return  AMDGPU::V_MOV_B64_PSEUDO;
 507   }
 508   return AMDGPU::COPY;
 509 }
 510
 511 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
 512   switch (Size) {
 513   case 4:
 514     return AMDGPU::SI_SPILL_S32_SAVE;
 515   case 8:
 516     return AMDGPU::SI_SPILL_S64_SAVE;
 517   case 16:
 518     return AMDGPU::SI_SPILL_S128_SAVE;
 519   case 32:
 520     return AMDGPU::SI_SPILL_S256_SAVE;
 521   case 64:
 522     return AMDGPU::SI_SPILL_S512_SAVE;
 523   default:
 524     llvm_unreachable("unknown register size");
 525   }
 526 }
 527
 528 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
 529   switch (Size) {
 530   case 4:
 531     return AMDGPU::SI_SPILL_V32_SAVE;
 532   case 8:
 533     return AMDGPU::SI_SPILL_V64_SAVE;
 534   case 12:
 535     return AMDGPU::SI_SPILL_V96_SAVE;
 536   case 16:
 537     return AMDGPU::SI_SPILL_V128_SAVE;
 538   case 32:
 539     return AMDGPU::SI_SPILL_V256_SAVE;
 540   case 64:
 541     return AMDGPU::SI_SPILL_V512_SAVE;
 542   default:
 543     llvm_unreachable("unknown register size");
 544   }
 545 }
 546
 547 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 548                                       MachineBasicBlock::iterator MI,
 549                                       unsigned SrcReg, bool isKill,
 550                                       int FrameIndex,
 551                                       const TargetRegisterClass *RC,
 552                                       const TargetRegisterInfo *TRI) const {
 553   MachineFunction *MF = MBB.getParent();
 554   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 555   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 556   DebugLoc DL = MBB.findDebugLoc(MI);
 557
 558   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
 559   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
 560   MachinePointerInfo PtrInfo
 561     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 562   MachineMemOperand *MMO
 563     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
 564                                Size, Align);
 565   unsigned SpillSize = TRI->getSpillSize(*RC);
 566
 567   if (RI.isSGPRClass(RC)) {
 568     MFI->setHasSpilledSGPRs();
 569
 570     // We are only allowed to create one new instruction when spilling
 571     // registers, so we need to use pseudo instruction for spilling SGPRs.
 572     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
 573
 574     // The SGPR spill/restore instructions only work on number sgprs, so we need
 575     // to make sure we are using the correct register class.
 576     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
 577       MachineRegisterInfo &MRI = MF->getRegInfo();
 578       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
 579     }
 580
 581     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
 582       .addReg(SrcReg, getKillRegState(isKill)) // data
 583       .addFrameIndex(FrameIndex)               // addr
 584       .addMemOperand(MMO)
 585       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
 586       .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
 587     // Add the scratch resource registers as implicit uses because we may end up
 588     // needing them, and need to ensure that the reserved registers are
 589     // correctly handled.
 590
 591     if (ST.hasScalarStores()) {
 592       // m0 is used for offset to scalar stores if used to spill.
 593       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
 594     }
 595
 596     return;
 597   }
 598
 599   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
 600     LLVMContext &Ctx = MF->getFunction()->getContext();
 601     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
 602                   " spill register");
 603     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
 604       .addReg(SrcReg);
 605
 606     return;
 607   }
 608
 609   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 610
 611   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
 612   MFI->setHasSpilledVGPRs();
 613   BuildMI(MBB, MI, DL, get(Opcode))
 614     .addReg(SrcReg, getKillRegState(isKill)) // data
 615     .addFrameIndex(FrameIndex)               // addr
 616     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
 617     .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
 618     .addImm(0)                               // offset
 619     .addMemOperand(MMO);
 620 }
 621
 622 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
 623   switch (Size) {
 624   case 4:
 625     return AMDGPU::SI_SPILL_S32_RESTORE;
 626   case 8:
 627     return AMDGPU::SI_SPILL_S64_RESTORE;
 628   case 16:
 629     return AMDGPU::SI_SPILL_S128_RESTORE;
 630   case 32:
 631     return AMDGPU::SI_SPILL_S256_RESTORE;
 632   case 64:
 633     return AMDGPU::SI_SPILL_S512_RESTORE;
 634   default:
 635     llvm_unreachable("unknown register size");
 636   }
 637 }
 638
 639 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
 640   switch (Size) {
 641   case 4:
 642     return AMDGPU::SI_SPILL_V32_RESTORE;
 643   case 8:
 644     return AMDGPU::SI_SPILL_V64_RESTORE;
 645   case 12:
 646     return AMDGPU::SI_SPILL_V96_RESTORE;
 647   case 16:
 648     return AMDGPU::SI_SPILL_V128_RESTORE;
 649   case 32:
 650     return AMDGPU::SI_SPILL_V256_RESTORE;
 651   case 64:
 652     return AMDGPU::SI_SPILL_V512_RESTORE;
 653   default:
 654     llvm_unreachable("unknown register size");
 655   }
 656 }
 657
 658 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 659                                        MachineBasicBlock::iterator MI,
 660                                        unsigned DestReg, int FrameIndex,
 661                                        const TargetRegisterClass *RC,
 662                                        const TargetRegisterInfo *TRI) const {
 663   MachineFunction *MF = MBB.getParent();
 664   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 665   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 666   DebugLoc DL = MBB.findDebugLoc(MI);
 667   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
 668   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
 669   unsigned SpillSize = TRI->getSpillSize(*RC);
 670
 671   MachinePointerInfo PtrInfo
 672     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 673
 674   MachineMemOperand *MMO = MF->getMachineMemOperand(
 675     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
 676
 677   if (RI.isSGPRClass(RC)) {
 678     // FIXME: Maybe this should not include a memoperand because it will be
 679     // lowered to non-memory instructions.
 680     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
 681     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
 682       MachineRegisterInfo &MRI = MF->getRegInfo();
 683       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
 684     }
 685
 686     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
 687       .addFrameIndex(FrameIndex) // addr
 688       .addMemOperand(MMO)
 689       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
 690       .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
 691
 692     if (ST.hasScalarStores()) {
 693       // m0 is used for offset to scalar stores if used to spill.
 694       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
 695     }
 696
 697     return;
 698   }
 699
 700   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
 701     LLVMContext &Ctx = MF->getFunction()->getContext();
 702     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
 703                   " restore register");
 704     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
 705
 706     return;
 707   }
 708
 709   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 710
 711   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
 712   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
 713     .addFrameIndex(FrameIndex)              // vaddr
 714     .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
 715     .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
 716     .addImm(0)                              // offset
 717     .addMemOperand(MMO);
 718 }
 719
 720 /// \param @Offset Offset in bytes of the FrameIndex being spilled
 721 unsigned SIInstrInfo::calculateLDSSpillAddress(
 722     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
 723     unsigned FrameOffset, unsigned Size) const {
 724   MachineFunction *MF = MBB.getParent();
 725   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 726   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
 727   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 728   DebugLoc DL = MBB.findDebugLoc(MI);
 729   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
 730   unsigned WavefrontSize = ST.getWavefrontSize();
 731
 732   unsigned TIDReg = MFI->getTIDReg();
 733   if (!MFI->hasCalculatedTID()) {
 734     MachineBasicBlock &Entry = MBB.getParent()->front();
 735     MachineBasicBlock::iterator Insert = Entry.front();
 736     DebugLoc DL = Insert->getDebugLoc();
 737
 738     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
 739                                    *MF);
 740     if (TIDReg == AMDGPU::NoRegister)
 741       return TIDReg;
 742
 743     if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
 744         WorkGroupSize > WavefrontSize) {
 745
 746       unsigned TIDIGXReg
 747         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
 748       unsigned TIDIGYReg
 749         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
 750       unsigned TIDIGZReg
 751         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
 752       unsigned InputPtrReg =
 753           TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
 754       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
 755         if (!Entry.isLiveIn(Reg))
 756           Entry.addLiveIn(Reg);
 757       }
 758
 759       RS->enterBasicBlock(Entry);
 760       // FIXME: Can we scavenge an SReg_64 and access the subregs?
 761       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
 762       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
 763       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
 764               .addReg(InputPtrReg)
 765               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
 766       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
 767               .addReg(InputPtrReg)
 768               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
 769
 770       // NGROUPS.X * NGROUPS.Y
 771       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
 772               .addReg(STmp1)
 773               .addReg(STmp0);
 774       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
 775       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
 776               .addReg(STmp1)
 777               .addReg(TIDIGXReg);
 778       // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)
 779       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
 780               .addReg(STmp0)
 781               .addReg(TIDIGYReg)
 782               .addReg(TIDReg);
 783       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
 784       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
 785               .addReg(TIDReg)
 786               .addReg(TIDIGZReg);
 787     } else {
 788       // Get the wave id
 789       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
 790               TIDReg)
 791               .addImm(-1)
 792               .addImm(0);
 793
 794       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
 795               TIDReg)
 796               .addImm(-1)
 797               .addReg(TIDReg);
 798     }
 799
 800     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
 801             TIDReg)
 802             .addImm(2)
 803             .addReg(TIDReg);
 804     MFI->setTIDReg(TIDReg);
 805   }
 806
 807   // Add FrameIndex to LDS offset
 808   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
 809   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
 810           .addImm(LDSOffset)
 811           .addReg(TIDReg);
 812
 813   return TmpReg;
 814 }
 815
 816 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
 817                                    MachineBasicBlock::iterator MI,
 818                                    int Count) const {
 819   DebugLoc DL = MBB.findDebugLoc(MI);
 820   while (Count > 0) {
 821     int Arg;
 822     if (Count >= 8)
 823       Arg = 7;
 824     else
 825       Arg = Count - 1;
 826     Count -= 8;
 827     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
 828             .addImm(Arg);
 829   }
 830 }
 831
 832 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
 833                              MachineBasicBlock::iterator MI) const {
 834   insertWaitStates(MBB, MI, 1);
 835 }
 836
 837 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
 838   switch (MI.getOpcode()) {
 839   default: return 1; // FIXME: Do wait states equal cycles?
 840
 841   case AMDGPU::S_NOP:
 842     return MI.getOperand(0).getImm() + 1;
 843   }
 844 }
 845
 846 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 847   MachineBasicBlock &MBB = *MI.getParent();
 848   DebugLoc DL = MBB.findDebugLoc(MI);
 849   switch (MI.getOpcode()) {
 850   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
 851   case AMDGPU::S_MOV_B64_term: {
 852     // This is only a terminator to get the correct spill code placement during
 853     // register allocation.
 854     MI.setDesc(get(AMDGPU::S_MOV_B64));
 855     break;
 856   }
 857   case AMDGPU::S_XOR_B64_term: {
 858     // This is only a terminator to get the correct spill code placement during
 859     // register allocation.
 860     MI.setDesc(get(AMDGPU::S_XOR_B64));
 861     break;
 862   }
 863   case AMDGPU::S_ANDN2_B64_term: {
 864     // This is only a terminator to get the correct spill code placement during
 865     // register allocation.
 866     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
 867     break;
 868   }
 869   case AMDGPU::V_MOV_B64_PSEUDO: {
 870     unsigned Dst = MI.getOperand(0).getReg();
 871     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
 872     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
 873
 874     const MachineOperand &SrcOp = MI.getOperand(1);
 875     // FIXME: Will this work for 64-bit floating point immediates?
 876     assert(!SrcOp.isFPImm());
 877     if (SrcOp.isImm()) {
 878       APInt Imm(64, SrcOp.getImm());
 879       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
 880         .addImm(Imm.getLoBits(32).getZExtValue())
 881         .addReg(Dst, RegState::Implicit | RegState::Define);
 882       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
 883         .addImm(Imm.getHiBits(32).getZExtValue())
 884         .addReg(Dst, RegState::Implicit | RegState::Define);
 885     } else {
 886       assert(SrcOp.isReg());
 887       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
 888         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
 889         .addReg(Dst, RegState::Implicit | RegState::Define);
 890       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
 891         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
 892         .addReg(Dst, RegState::Implicit | RegState::Define);
 893     }
 894     MI.eraseFromParent();
 895     break;
 896   }
 897   case AMDGPU::V_MOVRELD_B32_V1:
 898   case AMDGPU::V_MOVRELD_B32_V2:
 899   case AMDGPU::V_MOVRELD_B32_V4:
 900   case AMDGPU::V_MOVRELD_B32_V8:
 901   case AMDGPU::V_MOVRELD_B32_V16: {
 902     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
 903     unsigned VecReg = MI.getOperand(0).getReg();
 904     bool IsUndef = MI.getOperand(1).isUndef();
 905     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
 906     assert(VecReg == MI.getOperand(1).getReg());
 907
 908     MachineInstr *MovRel =
 909         BuildMI(MBB, MI, DL, MovRelDesc)
 910             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
 911             .add(MI.getOperand(2))
 912             .addReg(VecReg, RegState::ImplicitDefine)
 913             .addReg(VecReg,
 914                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
 915
 916     const int ImpDefIdx =
 917         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
 918     const int ImpUseIdx = ImpDefIdx + 1;
 919     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
 920
 921     MI.eraseFromParent();
 922     break;
 923   }
 924   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
 925     MachineFunction &MF = *MBB.getParent();
 926     unsigned Reg = MI.getOperand(0).getReg();
 927     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
 928     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
 929
 930     // Create a bundle so these instructions won't be re-ordered by the
 931     // post-RA scheduler.
 932     MIBundleBuilder Bundler(MBB, MI);
 933     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
 934
 935     // Add 32-bit offset from this instruction to the start of the
 936     // constant data.
 937     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
 938                        .addReg(RegLo)
 939                        .add(MI.getOperand(1)));
 940
 941     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
 942                                   .addReg(RegHi);
 943     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
 944       MIB.addImm(0);
 945     else
 946       MIB.add(MI.getOperand(2));
 947
 948     Bundler.append(MIB);
 949     llvm::finalizeBundle(MBB, Bundler.begin());
 950
 951     MI.eraseFromParent();
 952     break;
 953   }
 954   }
 955   return true;
 956 }
 957
 958 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
 959                                       MachineOperand &Src0,
 960                                       unsigned Src0OpName,
 961                                       MachineOperand &Src1,
 962                                       unsigned Src1OpName) const {
 963   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
 964   if (!Src0Mods)
 965     return false;
 966
 967   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
 968   assert(Src1Mods &&
 969          "All commutable instructions have both src0 and src1 modifiers");
 970
 971   int Src0ModsVal = Src0Mods->getImm();
 972   int Src1ModsVal = Src1Mods->getImm();
 973
 974   Src1Mods->setImm(Src0ModsVal);
 975   Src0Mods->setImm(Src1ModsVal);
 976   return true;
 977 }
 978
 979 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
 980                                              MachineOperand &RegOp,
 981                                              MachineOperand &NonRegOp) {
 982   unsigned Reg = RegOp.getReg();
 983   unsigned SubReg = RegOp.getSubReg();
 984   bool IsKill = RegOp.isKill();
 985   bool IsDead = RegOp.isDead();
 986   bool IsUndef = RegOp.isUndef();
 987   bool IsDebug = RegOp.isDebug();
 988
 989   if (NonRegOp.isImm())
 990     RegOp.ChangeToImmediate(NonRegOp.getImm());
 991   else if (NonRegOp.isFI())
 992     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
 993   else
 994     return nullptr;
 995
 996   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
 997   NonRegOp.setSubReg(SubReg);
 998
 999   return &MI;
1000 }
1001
1002 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1003                                                   unsigned Src0Idx,
1004                                                   unsigned Src1Idx) const {
1005   assert(!NewMI && "this should never be used");
1006
1007   unsigned Opc = MI.getOpcode();
1008   int CommutedOpcode = commuteOpcode(Opc);
1009   if (CommutedOpcode == -1)
1010     return nullptr;
1011
1012   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1013            static_cast<int>(Src0Idx) &&
1014          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1015            static_cast<int>(Src1Idx) &&
1016          "inconsistency with findCommutedOpIndices");
1017
1018   MachineOperand &Src0 = MI.getOperand(Src0Idx);
1019   MachineOperand &Src1 = MI.getOperand(Src1Idx);
1020
1021   MachineInstr *CommutedMI = nullptr;
1022   if (Src0.isReg() && Src1.isReg()) {
1023     if (isOperandLegal(MI, Src1Idx, &Src0)) {
1024       // Be sure to copy the source modifiers to the right place.
1025       CommutedMI
1026         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1027     }
1028
1029   } else if (Src0.isReg() && !Src1.isReg()) {
1030     // src0 should always be able to support any operand type, so no need to
1031     // check operand legality.
1032     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1033   } else if (!Src0.isReg() && Src1.isReg()) {
1034     if (isOperandLegal(MI, Src1Idx, &Src0))
1035       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1036   } else {
1037     // FIXME: Found two non registers to commute. This does happen.
1038     return nullptr;
1039   }
1040
1041
1042   if (CommutedMI) {
1043     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1044                         Src1, AMDGPU::OpName::src1_modifiers);
1045
1046     CommutedMI->setDesc(get(CommutedOpcode));
1047   }
1048
1049   return CommutedMI;
1050 }
1051
1052 // This needs to be implemented because the source modifiers may be inserted
1053 // between the true commutable operands, and the base
1054 // TargetInstrInfo::commuteInstruction uses it.
1055 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1056                                         unsigned &SrcOpIdx1) const {
1057   if (!MI.isCommutable())
1058     return false;
1059
1060   unsigned Opc = MI.getOpcode();
1061   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1062   if (Src0Idx == -1)
1063     return false;
1064
1065   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1066   if (Src1Idx == -1)
1067     return false;
1068
1069   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1070 }
1071
1072 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1073                                         int64_t BrOffset) const {
1074   // BranchRelaxation should never have to check s_setpc_b64 because its dest
1075   // block is unanalyzable.
1076   assert(BranchOp != AMDGPU::S_SETPC_B64);
1077
1078   // Convert to dwords.
1079   BrOffset /= 4;
1080
1081   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1082   // from the next instruction.
1083   BrOffset -= 1;
1084
1085   return isIntN(BranchOffsetBits, BrOffset);
1086 }
1087
1088 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1089   const MachineInstr &MI) const {
1090   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1091     // This would be a difficult analysis to perform, but can always be legal so
1092     // there's no need to analyze it.
1093     return nullptr;
1094   }
1095
1096   return MI.getOperand(0).getMBB();
1097 }
1098
1099 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1100                                            MachineBasicBlock &DestBB,
1101                                            const DebugLoc &DL,
1102                                            int64_t BrOffset,
1103                                            RegScavenger *RS) const {
1104   assert(RS && "RegScavenger required for long branching");
1105   assert(MBB.empty() &&
1106          "new block should be inserted for expanding unconditional branch");
1107   assert(MBB.pred_size() == 1);
1108
1109   MachineFunction *MF = MBB.getParent();
1110   MachineRegisterInfo &MRI = MF->getRegInfo();
1111
1112   // FIXME: Virtual register workaround for RegScavenger not working with empty
1113   // blocks.
1114   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1115
1116   auto I = MBB.end();
1117
1118   // We need to compute the offset relative to the instruction immediately after
1119   // s_getpc_b64. Insert pc arithmetic code before last terminator.
1120   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1121
1122   // TODO: Handle > 32-bit block address.
1123   if (BrOffset >= 0) {
1124     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1125       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1126       .addReg(PCReg, 0, AMDGPU::sub0)
1127       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1128     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1129       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1130       .addReg(PCReg, 0, AMDGPU::sub1)
1131       .addImm(0);
1132   } else {
1133     // Backwards branch.
1134     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1135       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1136       .addReg(PCReg, 0, AMDGPU::sub0)
1137       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1138     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1139       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1140       .addReg(PCReg, 0, AMDGPU::sub1)
1141       .addImm(0);
1142   }
1143
1144   // Insert the indirect branch after the other terminator.
1145   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1146     .addReg(PCReg);
1147
1148   // FIXME: If spilling is necessary, this will fail because this scavenger has
1149   // no emergency stack slots. It is non-trivial to spill in this situation,
1150   // because the restore code needs to be specially placed after the
1151   // jump. BranchRelaxation then needs to be made aware of the newly inserted
1152   // block.
1153   //
1154   // If a spill is needed for the pc register pair, we need to insert a spill
1155   // restore block right before the destination block, and insert a short branch
1156   // into the old destination block's fallthrough predecessor.
1157   // e.g.:
1158   //
1159   // s_cbranch_scc0 skip_long_branch:
1160   //
1161   // long_branch_bb:
1162   //   spill s[8:9]
1163   //   s_getpc_b64 s[8:9]
1164   //   s_add_u32 s8, s8, restore_bb
1165   //   s_addc_u32 s9, s9, 0
1166   //   s_setpc_b64 s[8:9]
1167   //
1168   // skip_long_branch:
1169   //   foo;
1170   //
1171   // .....
1172   //
1173   // dest_bb_fallthrough_predecessor:
1174   // bar;
1175   // s_branch dest_bb
1176   //
1177   // restore_bb:
1178   //  restore s[8:9]
1179   //  fallthrough dest_bb
1180   ///
1181   // dest_bb:
1182   //   buzz;
1183
1184   RS->enterBasicBlockEnd(MBB);
1185   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1186                                        MachineBasicBlock::iterator(GetPC), 0);
1187   MRI.replaceRegWith(PCReg, Scav);
1188   MRI.clearVirtRegs();
1189   RS->setRegUsed(Scav);
1190
1191   return 4 + 8 + 4 + 4;
1192 }
1193
1194 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1195   switch (Cond) {
1196   case SIInstrInfo::SCC_TRUE:
1197     return AMDGPU::S_CBRANCH_SCC1;
1198   case SIInstrInfo::SCC_FALSE:
1199     return AMDGPU::S_CBRANCH_SCC0;
1200   case SIInstrInfo::VCCNZ:
1201     return AMDGPU::S_CBRANCH_VCCNZ;
1202   case SIInstrInfo::VCCZ:
1203     return AMDGPU::S_CBRANCH_VCCZ;
1204   case SIInstrInfo::EXECNZ:
1205     return AMDGPU::S_CBRANCH_EXECNZ;
1206   case SIInstrInfo::EXECZ:
1207     return AMDGPU::S_CBRANCH_EXECZ;
1208   default:
1209     llvm_unreachable("invalid branch predicate");
1210   }
1211 }
1212
1213 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1214   switch (Opcode) {
1215   case AMDGPU::S_CBRANCH_SCC0:
1216     return SCC_FALSE;
1217   case AMDGPU::S_CBRANCH_SCC1:
1218     return SCC_TRUE;
1219   case AMDGPU::S_CBRANCH_VCCNZ:
1220     return VCCNZ;
1221   case AMDGPU::S_CBRANCH_VCCZ:
1222     return VCCZ;
1223   case AMDGPU::S_CBRANCH_EXECNZ:
1224     return EXECNZ;
1225   case AMDGPU::S_CBRANCH_EXECZ:
1226     return EXECZ;
1227   default:
1228     return INVALID_BR;
1229   }
1230 }
1231
1232 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1233                                     MachineBasicBlock::iterator I,
1234                                     MachineBasicBlock *&TBB,
1235                                     MachineBasicBlock *&FBB,
1236                                     SmallVectorImpl<MachineOperand> &Cond,
1237                                     bool AllowModify) const {
1238   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1239     // Unconditional Branch
1240     TBB = I->getOperand(0).getMBB();
1241     return false;
1242   }
1243
1244   BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1245   if (Pred == INVALID_BR)
1246     return true;
1247
1248   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
1249   Cond.push_back(MachineOperand::CreateImm(Pred));
1250   Cond.push_back(I->getOperand(1)); // Save the branch register.
1251
1252   ++I;
1253
1254   if (I == MBB.end()) {
1255     // Conditional branch followed by fall-through.
1256     TBB = CondBB;
1257     return false;
1258   }
1259
1260   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1261     TBB = CondBB;
1262     FBB = I->getOperand(0).getMBB();
1263     return false;
1264   }
1265
1266   return true;
1267 }
1268
1269 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1270                                 MachineBasicBlock *&FBB,
1271                                 SmallVectorImpl<MachineOperand> &Cond,
1272                                 bool AllowModify) const {
1273   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1274   if (I == MBB.end())
1275     return false;
1276
1277   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1278     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1279
1280   ++I;
1281
1282   // TODO: Should be able to treat as fallthrough?
1283   if (I == MBB.end())
1284     return true;
1285
1286   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1287     return true;
1288
1289   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1290
1291   // Specifically handle the case where the conditional branch is to the same
1292   // destination as the mask branch. e.g.
1293   //
1294   // si_mask_branch BB8
1295   // s_cbranch_execz BB8
1296   // s_cbranch BB9
1297   //
1298   // This is required to understand divergent loops which may need the branches
1299   // to be relaxed.
1300   if (TBB != MaskBrDest || Cond.empty())
1301     return true;
1302
1303   auto Pred = Cond[0].getImm();
1304   return (Pred != EXECZ && Pred != EXECNZ);
1305 }
1306
1307 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1308                                    int *BytesRemoved) const {
1309   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1310
1311   unsigned Count = 0;
1312   unsigned RemovedSize = 0;
1313   while (I != MBB.end()) {
1314     MachineBasicBlock::iterator Next = std::next(I);
1315     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1316       I = Next;
1317       continue;
1318     }
1319
1320     RemovedSize += getInstSizeInBytes(*I);
1321     I->eraseFromParent();
1322     ++Count;
1323     I = Next;
1324   }
1325
1326   if (BytesRemoved)
1327     *BytesRemoved = RemovedSize;
1328
1329   return Count;
1330 }
1331
1332 // Copy the flags onto the implicit condition register operand.
1333 static void preserveCondRegFlags(MachineOperand &CondReg,
1334                                  const MachineOperand &OrigCond) {
1335   CondReg.setIsUndef(OrigCond.isUndef());
1336   CondReg.setIsKill(OrigCond.isKill());
1337 }
1338
1339 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1340                                    MachineBasicBlock *TBB,
1341                                    MachineBasicBlock *FBB,
1342                                    ArrayRef<MachineOperand> Cond,
1343                                    const DebugLoc &DL,
1344                                    int *BytesAdded) const {
1345
1346   if (!FBB && Cond.empty()) {
1347     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1348       .addMBB(TBB);
1349     if (BytesAdded)
1350       *BytesAdded = 4;
1351     return 1;
1352   }
1353
1354   assert(TBB && Cond[0].isImm());
1355
1356   unsigned Opcode
1357     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1358
1359   if (!FBB) {
1360     Cond[1].isUndef();
1361     MachineInstr *CondBr =
1362       BuildMI(&MBB, DL, get(Opcode))
1363       .addMBB(TBB);
1364
1365     // Copy the flags onto the implicit condition register operand.
1366     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1367
1368     if (BytesAdded)
1369       *BytesAdded = 4;
1370     return 1;
1371   }
1372
1373   assert(TBB && FBB);
1374
1375   MachineInstr *CondBr =
1376     BuildMI(&MBB, DL, get(Opcode))
1377     .addMBB(TBB);
1378   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1379     .addMBB(FBB);
1380
1381   MachineOperand &CondReg = CondBr->getOperand(1);
1382   CondReg.setIsUndef(Cond[1].isUndef());
1383   CondReg.setIsKill(Cond[1].isKill());
1384
1385   if (BytesAdded)
1386       *BytesAdded = 8;
1387
1388   return 2;
1389 }
1390
1391 bool SIInstrInfo::reverseBranchCondition(
1392   SmallVectorImpl<MachineOperand> &Cond) const {
1393   assert(Cond.size() == 2);
1394   Cond[0].setImm(-Cond[0].getImm());
1395   return false;
1396 }
1397
1398 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1399                                   ArrayRef<MachineOperand> Cond,
1400                                   unsigned TrueReg, unsigned FalseReg,
1401                                   int &CondCycles,
1402                                   int &TrueCycles, int &FalseCycles) const {
1403   switch (Cond[0].getImm()) {
1404   case VCCNZ:
1405   case VCCZ: {
1406     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1407     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1408     assert(MRI.getRegClass(FalseReg) == RC);
1409
1410     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1411     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1412
1413     // Limit to equal cost for branch vs. N v_cndmask_b32s.
1414     return !RI.isSGPRClass(RC) && NumInsts <= 6;
1415   }
1416   case SCC_TRUE:
1417   case SCC_FALSE: {
1418     // FIXME: We could insert for VGPRs if we could replace the original compare
1419     // with a vector one.
1420     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1421     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1422     assert(MRI.getRegClass(FalseReg) == RC);
1423
1424     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1425
1426     // Multiples of 8 can do s_cselect_b64
1427     if (NumInsts % 2 == 0)
1428       NumInsts /= 2;
1429
1430     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1431     return RI.isSGPRClass(RC);
1432   }
1433   default:
1434     return false;
1435   }
1436 }
1437
1438 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1439                                MachineBasicBlock::iterator I, const DebugLoc &DL,
1440                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
1441                                unsigned TrueReg, unsigned FalseReg) const {
1442   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1443   if (Pred == VCCZ || Pred == SCC_FALSE) {
1444     Pred = static_cast<BranchPredicate>(-Pred);
1445     std::swap(TrueReg, FalseReg);
1446   }
1447
1448   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1449   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1450   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1451
1452   if (DstSize == 32) {
1453     unsigned SelOp = Pred == SCC_TRUE ?
1454       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1455
1456     // Instruction's operands are backwards from what is expected.
1457     MachineInstr *Select =
1458       BuildMI(MBB, I, DL, get(SelOp), DstReg)
1459       .addReg(FalseReg)
1460       .addReg(TrueReg);
1461
1462     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1463     return;
1464   }
1465
1466   if (DstSize == 64 && Pred == SCC_TRUE) {
1467     MachineInstr *Select =
1468       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1469       .addReg(FalseReg)
1470       .addReg(TrueReg);
1471
1472     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1473     return;
1474   }
1475
1476   static const int16_t Sub0_15[] = {
1477     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1478     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1479     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1480     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1481   };
1482
1483   static const int16_t Sub0_15_64[] = {
1484     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1485     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1486     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1487     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1488   };
1489
1490   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1491   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1492   const int16_t *SubIndices = Sub0_15;
1493   int NElts = DstSize / 32;
1494
1495   // 64-bit select is only avaialble for SALU.
1496   if (Pred == SCC_TRUE) {
1497     SelOp = AMDGPU::S_CSELECT_B64;
1498     EltRC = &AMDGPU::SGPR_64RegClass;
1499     SubIndices = Sub0_15_64;
1500
1501     assert(NElts % 2 == 0);
1502     NElts /= 2;
1503   }
1504
1505   MachineInstrBuilder MIB = BuildMI(
1506     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1507
1508   I = MIB->getIterator();
1509
1510   SmallVector<unsigned, 8> Regs;
1511   for (int Idx = 0; Idx != NElts; ++Idx) {
1512     unsigned DstElt = MRI.createVirtualRegister(EltRC);
1513     Regs.push_back(DstElt);
1514
1515     unsigned SubIdx = SubIndices[Idx];
1516
1517     MachineInstr *Select =
1518       BuildMI(MBB, I, DL, get(SelOp), DstElt)
1519       .addReg(FalseReg, 0, SubIdx)
1520       .addReg(TrueReg, 0, SubIdx);
1521     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1522
1523     MIB.addReg(DstElt)
1524        .addImm(SubIdx);
1525   }
1526 }
1527
1528 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1529   switch (MI.getOpcode()) {
1530   case AMDGPU::V_MOV_B32_e32:
1531   case AMDGPU::V_MOV_B32_e64:
1532   case AMDGPU::V_MOV_B64_PSEUDO: {
1533     // If there are additional implicit register operands, this may be used for
1534     // register indexing so the source register operand isn't simply copied.
1535     unsigned NumOps = MI.getDesc().getNumOperands() +
1536       MI.getDesc().getNumImplicitUses();
1537
1538     return MI.getNumOperands() == NumOps;
1539   }
1540   case AMDGPU::S_MOV_B32:
1541   case AMDGPU::S_MOV_B64:
1542   case AMDGPU::COPY:
1543     return true;
1544   default:
1545     return false;
1546   }
1547 }
1548
1549 static void removeModOperands(MachineInstr &MI) {
1550   unsigned Opc = MI.getOpcode();
1551   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1552                                               AMDGPU::OpName::src0_modifiers);
1553   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1554                                               AMDGPU::OpName::src1_modifiers);
1555   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1556                                               AMDGPU::OpName::src2_modifiers);
1557
1558   MI.RemoveOperand(Src2ModIdx);
1559   MI.RemoveOperand(Src1ModIdx);
1560   MI.RemoveOperand(Src0ModIdx);
1561 }
1562
1563 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1564                                 unsigned Reg, MachineRegisterInfo *MRI) const {
1565   if (!MRI->hasOneNonDBGUse(Reg))
1566     return false;
1567
1568   unsigned Opc = UseMI.getOpcode();
1569   if (Opc == AMDGPU::COPY) {
1570     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1571     switch (DefMI.getOpcode()) {
1572     default:
1573       return false;
1574     case AMDGPU::S_MOV_B64:
1575       // TODO: We could fold 64-bit immediates, but this get compilicated
1576       // when there are sub-registers.
1577       return false;
1578
1579     case AMDGPU::V_MOV_B32_e32:
1580     case AMDGPU::S_MOV_B32:
1581       break;
1582     }
1583     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1584     const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1585     assert(ImmOp);
1586     // FIXME: We could handle FrameIndex values here.
1587     if (!ImmOp->isImm()) {
1588       return false;
1589     }
1590     UseMI.setDesc(get(NewOpc));
1591     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1592     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1593     return true;
1594   }
1595
1596   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1597       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1598     // Don't fold if we are using source or output modifiers. The new VOP2
1599     // instructions don't have them.
1600     if (hasAnyModifiersSet(UseMI))
1601       return false;
1602
1603     const MachineOperand &ImmOp = DefMI.getOperand(1);
1604
1605     // If this is a free constant, there's no reason to do this.
1606     // TODO: We could fold this here instead of letting SIFoldOperands do it
1607     // later.
1608     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1609
1610     // Any src operand can be used for the legality check.
1611     if (isInlineConstant(UseMI, *Src0, ImmOp))
1612       return false;
1613
1614     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1615     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1616     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1617
1618     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1619     // We should only expect these to be on src0 due to canonicalizations.
1620     if (Src0->isReg() && Src0->getReg() == Reg) {
1621       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1622         return false;
1623
1624       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1625         return false;
1626
1627       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1628
1629       const int64_t Imm = DefMI.getOperand(1).getImm();
1630
1631       // FIXME: This would be a lot easier if we could return a new instruction
1632       // instead of having to modify in place.
1633
1634       // Remove these first since they are at the end.
1635       UseMI.RemoveOperand(
1636           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1637       UseMI.RemoveOperand(
1638           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1639
1640       unsigned Src1Reg = Src1->getReg();
1641       unsigned Src1SubReg = Src1->getSubReg();
1642       Src0->setReg(Src1Reg);
1643       Src0->setSubReg(Src1SubReg);
1644       Src0->setIsKill(Src1->isKill());
1645
1646       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1647           Opc == AMDGPU::V_MAC_F16_e64)
1648         UseMI.untieRegOperand(
1649             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1650
1651       Src1->ChangeToImmediate(Imm);
1652
1653       removeModOperands(UseMI);
1654       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1655
1656       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1657       if (DeleteDef)
1658         DefMI.eraseFromParent();
1659
1660       return true;
1661     }
1662
1663     // Added part is the constant: Use v_madak_{f16, f32}.
1664     if (Src2->isReg() && Src2->getReg() == Reg) {
1665       // Not allowed to use constant bus for another operand.
1666       // We can however allow an inline immediate as src0.
1667       if (!Src0->isImm() &&
1668           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
1669         return false;
1670
1671       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1672         return false;
1673
1674       const int64_t Imm = DefMI.getOperand(1).getImm();
1675
1676       // FIXME: This would be a lot easier if we could return a new instruction
1677       // instead of having to modify in place.
1678
1679       // Remove these first since they are at the end.
1680       UseMI.RemoveOperand(
1681           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1682       UseMI.RemoveOperand(
1683           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1684
1685       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1686           Opc == AMDGPU::V_MAC_F16_e64)
1687         UseMI.untieRegOperand(
1688             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1689
1690       // ChangingToImmediate adds Src2 back to the instruction.
1691       Src2->ChangeToImmediate(Imm);
1692
1693       // These come before src2.
1694       removeModOperands(UseMI);
1695       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
1696
1697       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1698       if (DeleteDef)
1699         DefMI.eraseFromParent();
1700
1701       return true;
1702     }
1703   }
1704
1705   return false;
1706 }
1707
1708 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
1709                                 int WidthB, int OffsetB) {
1710   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1711   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1712   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1713   return LowOffset + LowWidth <= HighOffset;
1714 }
1715
1716 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
1717                                                MachineInstr &MIb) const {
1718   unsigned BaseReg0, BaseReg1;
1719   int64_t Offset0, Offset1;
1720
1721   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1722       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1723
1724     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
1725       // FIXME: Handle ds_read2 / ds_write2.
1726       return false;
1727     }
1728     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
1729     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
1730     if (BaseReg0 == BaseReg1 &&
1731         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1732       return true;
1733     }
1734   }
1735
1736   return false;
1737 }
1738
1739 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
1740                                                   MachineInstr &MIb,
1741                                                   AliasAnalysis *AA) const {
1742   assert((MIa.mayLoad() || MIa.mayStore()) &&
1743          "MIa must load from or modify a memory location");
1744   assert((MIb.mayLoad() || MIb.mayStore()) &&
1745          "MIb must load from or modify a memory location");
1746
1747   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
1748     return false;
1749
1750   // XXX - Can we relax this between address spaces?
1751   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1752     return false;
1753
1754   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
1755     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
1756     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
1757     if (MMOa->getValue() && MMOb->getValue()) {
1758       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
1759       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
1760       if (!AA->alias(LocA, LocB))
1761         return true;
1762     }
1763   }
1764
1765   // TODO: Should we check the address space from the MachineMemOperand? That
1766   // would allow us to distinguish objects we know don't alias based on the
1767   // underlying address space, even if it was lowered to a different one,
1768   // e.g. private accesses lowered to use MUBUF instructions on a scratch
1769   // buffer.
1770   if (isDS(MIa)) {
1771     if (isDS(MIb))
1772       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1773
1774     return !isFLAT(MIb);
1775   }
1776
1777   if (isMUBUF(MIa) || isMTBUF(MIa)) {
1778     if (isMUBUF(MIb) || isMTBUF(MIb))
1779       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1780
1781     return !isFLAT(MIb) && !isSMRD(MIb);
1782   }
1783
1784   if (isSMRD(MIa)) {
1785     if (isSMRD(MIb))
1786       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1787
1788     return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
1789   }
1790
1791   if (isFLAT(MIa)) {
1792     if (isFLAT(MIb))
1793       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1794
1795     return false;
1796   }
1797
1798   return false;
1799 }
1800
1801 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
1802                                                  MachineInstr &MI,
1803                                                  LiveVariables *LV) const {
1804   bool IsF16 = false;
1805
1806   switch (MI.getOpcode()) {
1807   default:
1808     return nullptr;
1809   case AMDGPU::V_MAC_F16_e64:
1810     IsF16 = true;
1811   case AMDGPU::V_MAC_F32_e64:
1812     break;
1813   case AMDGPU::V_MAC_F16_e32:
1814     IsF16 = true;
1815   case AMDGPU::V_MAC_F32_e32: {
1816     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1817                                              AMDGPU::OpName::src0);
1818     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
1819     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
1820       return nullptr;
1821     break;
1822   }
1823   }
1824
1825   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
1826   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
1827   const MachineOperand *Src0Mods =
1828     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
1829   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
1830   const MachineOperand *Src1Mods =
1831     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
1832   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
1833   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
1834   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
1835
1836   return BuildMI(*MBB, MI, MI.getDebugLoc(),
1837                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
1838       .add(*Dst)
1839       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
1840       .add(*Src0)
1841       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
1842       .add(*Src1)
1843       .addImm(0) // Src mods
1844       .add(*Src2)
1845       .addImm(Clamp ? Clamp->getImm() : 0)
1846       .addImm(Omod ? Omod->getImm() : 0);
1847 }
1848
1849 // It's not generally safe to move VALU instructions across these since it will
1850 // start using the register as a base index rather than directly.
1851 // XXX - Why isn't hasSideEffects sufficient for these?
1852 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
1853   switch (MI.getOpcode()) {
1854   case AMDGPU::S_SET_GPR_IDX_ON:
1855   case AMDGPU::S_SET_GPR_IDX_MODE:
1856   case AMDGPU::S_SET_GPR_IDX_OFF:
1857     return true;
1858   default:
1859     return false;
1860   }
1861 }
1862
1863 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1864                                        const MachineBasicBlock *MBB,
1865                                        const MachineFunction &MF) const {
1866   // XXX - Do we want the SP check in the base implementation?
1867
1868   // Target-independent instructions do not have an implicit-use of EXEC, even
1869   // when they operate on VGPRs. Treating EXEC modifications as scheduling
1870   // boundaries prevents incorrect movements of such instructions.
1871   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
1872          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
1873          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
1874          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
1875          changesVGPRIndexingMode(MI);
1876 }
1877
1878 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
1879   switch (Imm.getBitWidth()) {
1880   case 32:
1881     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
1882                                         ST.hasInv2PiInlineImm());
1883   case 64:
1884     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
1885                                         ST.hasInv2PiInlineImm());
1886   case 16:
1887     return ST.has16BitInsts() &&
1888            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
1889                                         ST.hasInv2PiInlineImm());
1890   default:
1891     llvm_unreachable("invalid bitwidth");
1892   }
1893 }
1894
1895 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
1896                                    uint8_t OperandType) const {
1897   if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
1898     return false;
1899
1900   // MachineOperand provides no way to tell the true operand size, since it only
1901   // records a 64-bit value. We need to know the size to determine if a 32-bit
1902   // floating point immediate bit pattern is legal for an integer immediate. It
1903   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
1904
1905   int64_t Imm = MO.getImm();
1906   switch (OperandType) {
1907   case AMDGPU::OPERAND_REG_IMM_INT32:
1908   case AMDGPU::OPERAND_REG_IMM_FP32:
1909   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1910   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
1911     int32_t Trunc = static_cast<int32_t>(Imm);
1912     return Trunc == Imm &&
1913            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
1914   }
1915   case AMDGPU::OPERAND_REG_IMM_INT64:
1916   case AMDGPU::OPERAND_REG_IMM_FP64:
1917   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1918   case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
1919     return AMDGPU::isInlinableLiteral64(MO.getImm(),
1920                                         ST.hasInv2PiInlineImm());
1921   }
1922   case AMDGPU::OPERAND_REG_IMM_INT16:
1923   case AMDGPU::OPERAND_REG_IMM_FP16:
1924   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
1925   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
1926     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
1927       // A few special case instructions have 16-bit operands on subtargets
1928       // where 16-bit instructions are not legal.
1929       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
1930       // constants in these cases
1931       int16_t Trunc = static_cast<int16_t>(Imm);
1932       return ST.has16BitInsts() &&
1933              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
1934     }
1935
1936     return false;
1937   }
1938   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
1939   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
1940     uint32_t Trunc = static_cast<uint32_t>(Imm);
1941     return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
1942   }
1943   default:
1944     llvm_unreachable("invalid bitwidth");
1945   }
1946 }
1947
1948 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
1949                                         const MCOperandInfo &OpInfo) const {
1950   switch (MO.getType()) {
1951   case MachineOperand::MO_Register:
1952     return false;
1953   case MachineOperand::MO_Immediate:
1954     return !isInlineConstant(MO, OpInfo);
1955   case MachineOperand::MO_FrameIndex:
1956   case MachineOperand::MO_MachineBasicBlock:
1957   case MachineOperand::MO_ExternalSymbol:
1958   case MachineOperand::MO_GlobalAddress:
1959   case MachineOperand::MO_MCSymbol:
1960     return true;
1961   default:
1962     llvm_unreachable("unexpected operand type");
1963   }
1964 }
1965
1966 static bool compareMachineOp(const MachineOperand &Op0,
1967                              const MachineOperand &Op1) {
1968   if (Op0.getType() != Op1.getType())
1969     return false;
1970
1971   switch (Op0.getType()) {
1972   case MachineOperand::MO_Register:
1973     return Op0.getReg() == Op1.getReg();
1974   case MachineOperand::MO_Immediate:
1975     return Op0.getImm() == Op1.getImm();
1976   default:
1977     llvm_unreachable("Didn't expect to be comparing these operand types");
1978   }
1979 }
1980
1981 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
1982                                     const MachineOperand &MO) const {
1983   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
1984
1985   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1986
1987   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
1988     return true;
1989
1990   if (OpInfo.RegClass < 0)
1991     return false;
1992
1993   if (MO.isImm() && isInlineConstant(MO, OpInfo))
1994     return RI.opCanUseInlineConstant(OpInfo.OperandType);
1995
1996   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
1997 }
1998
1999 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2000   int Op32 = AMDGPU::getVOPe32(Opcode);
2001   if (Op32 == -1)
2002     return false;
2003
2004   return pseudoToMCOpcode(Op32) != -1;
2005 }
2006
2007 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2008   // The src0_modifier operand is present on all instructions
2009   // that have modifiers.
2010
2011   return AMDGPU::getNamedOperandIdx(Opcode,
2012                                     AMDGPU::OpName::src0_modifiers) != -1;
2013 }
2014
2015 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2016                                   unsigned OpName) const {
2017   const MachineOperand *Mods = getNamedOperand(MI, OpName);
2018   return Mods && Mods->getImm();
2019 }
2020
2021 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2022   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2023          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2024          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2025          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2026          hasModifiersSet(MI, AMDGPU::OpName::omod);
2027 }
2028
2029 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2030                                   const MachineOperand &MO,
2031                                   const MCOperandInfo &OpInfo) const {
2032   // Literal constants use the constant bus.
2033   //if (isLiteralConstantLike(MO, OpInfo))
2034   // return true;
2035   if (MO.isImm())
2036     return !isInlineConstant(MO, OpInfo);
2037
2038   if (!MO.isReg())
2039     return true; // Misc other operands like FrameIndex
2040
2041   if (!MO.isUse())
2042     return false;
2043
2044   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2045     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2046
2047   // FLAT_SCR is just an SGPR pair.
2048   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2049     return true;
2050
2051   // EXEC register uses the constant bus.
2052   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2053     return true;
2054
2055   // SGPRs use the constant bus
2056   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2057           (!MO.isImplicit() &&
2058            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2059             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2060 }
2061
2062 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2063   for (const MachineOperand &MO : MI.implicit_operands()) {
2064     // We only care about reads.
2065     if (MO.isDef())
2066       continue;
2067
2068     switch (MO.getReg()) {
2069     case AMDGPU::VCC:
2070     case AMDGPU::M0:
2071     case AMDGPU::FLAT_SCR:
2072       return MO.getReg();
2073
2074     default:
2075       break;
2076     }
2077   }
2078
2079   return AMDGPU::NoRegister;
2080 }
2081
2082 static bool shouldReadExec(const MachineInstr &MI) {
2083   if (SIInstrInfo::isVALU(MI)) {
2084     switch (MI.getOpcode()) {
2085     case AMDGPU::V_READLANE_B32:
2086     case AMDGPU::V_READLANE_B32_si:
2087     case AMDGPU::V_READLANE_B32_vi:
2088     case AMDGPU::V_WRITELANE_B32:
2089     case AMDGPU::V_WRITELANE_B32_si:
2090     case AMDGPU::V_WRITELANE_B32_vi:
2091       return false;
2092     }
2093
2094     return true;
2095   }
2096
2097   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2098       SIInstrInfo::isSALU(MI) ||
2099       SIInstrInfo::isSMRD(MI))
2100     return false;
2101
2102   return true;
2103 }
2104
2105 static bool isSubRegOf(const SIRegisterInfo &TRI,
2106                        const MachineOperand &SuperVec,
2107                        const MachineOperand &SubReg) {
2108   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2109     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2110
2111   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2112          SubReg.getReg() == SuperVec.getReg();
2113 }
2114
2115 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2116                                     StringRef &ErrInfo) const {
2117   uint16_t Opcode = MI.getOpcode();
2118   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2119   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2120   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2121   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2122
2123   // Make sure the number of operands is correct.
2124   const MCInstrDesc &Desc = get(Opcode);
2125   if (!Desc.isVariadic() &&
2126       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2127     ErrInfo = "Instruction has wrong number of operands.";
2128     return false;
2129   }
2130
2131   if (MI.isInlineAsm()) {
2132     // Verify register classes for inlineasm constraints.
2133     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2134          I != E; ++I) {
2135       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2136       if (!RC)
2137         continue;
2138
2139       const MachineOperand &Op = MI.getOperand(I);
2140       if (!Op.isReg())
2141         continue;
2142
2143       unsigned Reg = Op.getReg();
2144       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2145         ErrInfo = "inlineasm operand has incorrect register class.";
2146         return false;
2147       }
2148     }
2149
2150     return true;
2151   }
2152
2153   // Make sure the register classes are correct.
2154   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2155     if (MI.getOperand(i).isFPImm()) {
2156       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2157                 "all fp values to integers.";
2158       return false;
2159     }
2160
2161     int RegClass = Desc.OpInfo[i].RegClass;
2162
2163     switch (Desc.OpInfo[i].OperandType) {
2164     case MCOI::OPERAND_REGISTER:
2165       if (MI.getOperand(i).isImm()) {
2166         ErrInfo = "Illegal immediate value for operand.";
2167         return false;
2168       }
2169       break;
2170     case AMDGPU::OPERAND_REG_IMM_INT32:
2171     case AMDGPU::OPERAND_REG_IMM_FP32:
2172       break;
2173     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2174     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2175     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2176     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2177     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2178     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2179       const MachineOperand &MO = MI.getOperand(i);
2180       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2181         ErrInfo = "Illegal immediate value for operand.";
2182         return false;
2183       }
2184       break;
2185     }
2186     case MCOI::OPERAND_IMMEDIATE:
2187     case AMDGPU::OPERAND_KIMM32:
2188       // Check if this operand is an immediate.
2189       // FrameIndex operands will be replaced by immediates, so they are
2190       // allowed.
2191       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2192         ErrInfo = "Expected immediate, but got non-immediate";
2193         return false;
2194       }
2195       LLVM_FALLTHROUGH;
2196     default:
2197       continue;
2198     }
2199
2200     if (!MI.getOperand(i).isReg())
2201       continue;
2202
2203     if (RegClass != -1) {
2204       unsigned Reg = MI.getOperand(i).getReg();
2205       if (Reg == AMDGPU::NoRegister ||
2206           TargetRegisterInfo::isVirtualRegister(Reg))
2207         continue;
2208
2209       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2210       if (!RC->contains(Reg)) {
2211         ErrInfo = "Operand has incorrect register class.";
2212         return false;
2213       }
2214     }
2215   }
2216
2217   // Verify VOP*
2218   if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
2219     // Only look at the true operands. Only a real operand can use the constant
2220     // bus, and we don't want to check pseudo-operands like the source modifier
2221     // flags.
2222     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2223
2224     unsigned ConstantBusCount = 0;
2225
2226     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2227       ++ConstantBusCount;
2228
2229     unsigned SGPRUsed = findImplicitSGPRRead(MI);
2230     if (SGPRUsed != AMDGPU::NoRegister)
2231       ++ConstantBusCount;
2232
2233     for (int OpIdx : OpIndices) {
2234       if (OpIdx == -1)
2235         break;
2236       const MachineOperand &MO = MI.getOperand(OpIdx);
2237       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2238         if (MO.isReg()) {
2239           if (MO.getReg() != SGPRUsed)
2240             ++ConstantBusCount;
2241           SGPRUsed = MO.getReg();
2242         } else {
2243           ++ConstantBusCount;
2244         }
2245       }
2246     }
2247     if (ConstantBusCount > 1) {
2248       ErrInfo = "VOP* instruction uses the constant bus more than once";
2249       return false;
2250     }
2251   }
2252
2253   // Verify misc. restrictions on specific instructions.
2254   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2255       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2256     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2257     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2258     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2259     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2260       if (!compareMachineOp(Src0, Src1) &&
2261           !compareMachineOp(Src0, Src2)) {
2262         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2263         return false;
2264       }
2265     }
2266   }
2267
2268   if (isSOPK(MI)) {
2269     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2270     if (sopkIsZext(MI)) {
2271       if (!isUInt<16>(Imm)) {
2272         ErrInfo = "invalid immediate for SOPK instruction";
2273         return false;
2274       }
2275     } else {
2276       if (!isInt<16>(Imm)) {
2277         ErrInfo = "invalid immediate for SOPK instruction";
2278         return false;
2279       }
2280     }
2281   }
2282
2283   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2284       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2285       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2286       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2287     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2288                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2289
2290     const unsigned StaticNumOps = Desc.getNumOperands() +
2291       Desc.getNumImplicitUses();
2292     const unsigned NumImplicitOps = IsDst ? 2 : 1;
2293
2294     // Allow additional implicit operands. This allows a fixup done by the post
2295     // RA scheduler where the main implicit operand is killed and implicit-defs
2296     // are added for sub-registers that remain live after this instruction.
2297     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2298       ErrInfo = "missing implicit register operands";
2299       return false;
2300     }
2301
2302     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2303     if (IsDst) {
2304       if (!Dst->isUse()) {
2305         ErrInfo = "v_movreld_b32 vdst should be a use operand";
2306         return false;
2307       }
2308
2309       unsigned UseOpIdx;
2310       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2311           UseOpIdx != StaticNumOps + 1) {
2312         ErrInfo = "movrel implicit operands should be tied";
2313         return false;
2314       }
2315     }
2316
2317     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2318     const MachineOperand &ImpUse
2319       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2320     if (!ImpUse.isReg() || !ImpUse.isUse() ||
2321         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2322       ErrInfo = "src0 should be subreg of implicit vector use";
2323       return false;
2324     }
2325   }
2326
2327   // Make sure we aren't losing exec uses in the td files. This mostly requires
2328   // being careful when using let Uses to try to add other use registers.
2329   if (shouldReadExec(MI)) {
2330     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2331       ErrInfo = "VALU instruction does not implicitly read exec mask";
2332       return false;
2333     }
2334   }
2335
2336   if (isSMRD(MI)) {
2337     if (MI.mayStore()) {
2338       // The register offset form of scalar stores may only use m0 as the
2339       // soffset register.
2340       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2341       if (Soff && Soff->getReg() != AMDGPU::M0) {
2342         ErrInfo = "scalar stores must use m0 as offset register";
2343         return false;
2344       }
2345     }
2346   }
2347
2348   return true;
2349 }
2350
2351 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
2352   switch (MI.getOpcode()) {
2353   default: return AMDGPU::INSTRUCTION_LIST_END;
2354   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2355   case AMDGPU::COPY: return AMDGPU::COPY;
2356   case AMDGPU::PHI: return AMDGPU::PHI;
2357   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2358   case AMDGPU::S_MOV_B32:
2359     return MI.getOperand(1).isReg() ?
2360            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2361   case AMDGPU::S_ADD_I32:
2362   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
2363   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
2364   case AMDGPU::S_SUB_I32:
2365   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
2366   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2367   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2368   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2369   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2370   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2371   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2372   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2373   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2374   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2375   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2376   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2377   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2378   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2379   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2380   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2381   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2382   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2383   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2384   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2385   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2386   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2387   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2388   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2389   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2390   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2391   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2392   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2393   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2394   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2395   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2396   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2397   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2398   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2399   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2400   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2401   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2402   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2403   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2404   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2405   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2406   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2407   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2408   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2409   }
2410 }
2411
2412 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
2413   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
2414 }
2415
2416 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
2417                                                       unsigned OpNo) const {
2418   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2419   const MCInstrDesc &Desc = get(MI.getOpcode());
2420   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
2421       Desc.OpInfo[OpNo].RegClass == -1) {
2422     unsigned Reg = MI.getOperand(OpNo).getReg();
2423
2424     if (TargetRegisterInfo::isVirtualRegister(Reg))
2425       return MRI.getRegClass(Reg);
2426     return RI.getPhysRegClass(Reg);
2427   }
2428
2429   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2430   return RI.getRegClass(RCID);
2431 }
2432
2433 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2434   switch (MI.getOpcode()) {
2435   case AMDGPU::COPY:
2436   case AMDGPU::REG_SEQUENCE:
2437   case AMDGPU::PHI:
2438   case AMDGPU::INSERT_SUBREG:
2439     return RI.hasVGPRs(getOpRegClass(MI, 0));
2440   default:
2441     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
2442   }
2443 }
2444
2445 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
2446   MachineBasicBlock::iterator I = MI;
2447   MachineBasicBlock *MBB = MI.getParent();
2448   MachineOperand &MO = MI.getOperand(OpIdx);
2449   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2450   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
2451   const TargetRegisterClass *RC = RI.getRegClass(RCID);
2452   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
2453   if (MO.isReg())
2454     Opcode = AMDGPU::COPY;
2455   else if (RI.isSGPRClass(RC))
2456     Opcode = AMDGPU::S_MOV_B32;
2457
2458   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
2459   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
2460     VRC = &AMDGPU::VReg_64RegClass;
2461   else
2462     VRC = &AMDGPU::VGPR_32RegClass;
2463
2464   unsigned Reg = MRI.createVirtualRegister(VRC);
2465   DebugLoc DL = MBB->findDebugLoc(I);
2466   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
2467   MO.ChangeToRegister(Reg, false);
2468 }
2469
2470 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
2471                                          MachineRegisterInfo &MRI,
2472                                          MachineOperand &SuperReg,
2473                                          const TargetRegisterClass *SuperRC,
2474                                          unsigned SubIdx,
2475                                          const TargetRegisterClass *SubRC)
2476                                          const {
2477   MachineBasicBlock *MBB = MI->getParent();
2478   DebugLoc DL = MI->getDebugLoc();
2479   unsigned SubReg = MRI.createVirtualRegister(SubRC);
2480
2481   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
2482     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2483       .addReg(SuperReg.getReg(), 0, SubIdx);
2484     return SubReg;
2485   }
2486
2487   // Just in case the super register is itself a sub-register, copy it to a new
2488   // value so we don't need to worry about merging its subreg index with the
2489   // SubIdx passed to this function. The register coalescer should be able to
2490   // eliminate this extra copy.
2491   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
2492
2493   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
2494     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
2495
2496   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2497     .addReg(NewSuperReg, 0, SubIdx);
2498
2499   return SubReg;
2500 }
2501
2502 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
2503   MachineBasicBlock::iterator MII,
2504   MachineRegisterInfo &MRI,
2505   MachineOperand &Op,
2506   const TargetRegisterClass *SuperRC,
2507   unsigned SubIdx,
2508   const TargetRegisterClass *SubRC) const {
2509   if (Op.isImm()) {
2510     if (SubIdx == AMDGPU::sub0)
2511       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
2512     if (SubIdx == AMDGPU::sub1)
2513       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
2514
2515     llvm_unreachable("Unhandled register index for immediate");
2516   }
2517
2518   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
2519                                        SubIdx, SubRC);
2520   return MachineOperand::CreateReg(SubReg, false);
2521 }
2522
2523 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
2524 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
2525   assert(Inst.getNumExplicitOperands() == 3);
2526   MachineOperand Op1 = Inst.getOperand(1);
2527   Inst.RemoveOperand(1);
2528   Inst.addOperand(Op1);
2529 }
2530
2531 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
2532                                     const MCOperandInfo &OpInfo,
2533                                     const MachineOperand &MO) const {
2534   if (!MO.isReg())
2535     return false;
2536
2537   unsigned Reg = MO.getReg();
2538   const TargetRegisterClass *RC =
2539     TargetRegisterInfo::isVirtualRegister(Reg) ?
2540     MRI.getRegClass(Reg) :
2541     RI.getPhysRegClass(Reg);
2542
2543   const SIRegisterInfo *TRI =
2544       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
2545   RC = TRI->getSubRegClass(RC, MO.getSubReg());
2546
2547   // In order to be legal, the common sub-class must be equal to the
2548   // class of the current operand.  For example:
2549   //
2550   // v_mov_b32 s0 ; Operand defined as vsrc_b32
2551   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
2552   //
2553   // s_sendmsg 0, s0 ; Operand defined as m0reg
2554   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
2555
2556   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
2557 }
2558
2559 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
2560                                      const MCOperandInfo &OpInfo,
2561                                      const MachineOperand &MO) const {
2562   if (MO.isReg())
2563     return isLegalRegOperand(MRI, OpInfo, MO);
2564
2565   // Handle non-register types that are treated like immediates.
2566   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2567   return true;
2568 }
2569
2570 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
2571                                  const MachineOperand *MO) const {
2572   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2573   const MCInstrDesc &InstDesc = MI.getDesc();
2574   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
2575   const TargetRegisterClass *DefinedRC =
2576       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
2577   if (!MO)
2578     MO = &MI.getOperand(OpIdx);
2579
2580   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
2581
2582     RegSubRegPair SGPRUsed;
2583     if (MO->isReg())
2584       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2585
2586     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2587       if (i == OpIdx)
2588         continue;
2589       const MachineOperand &Op = MI.getOperand(i);
2590       if (Op.isReg()) {
2591         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2592             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
2593           return false;
2594         }
2595       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
2596         return false;
2597       }
2598     }
2599   }
2600
2601   if (MO->isReg()) {
2602     assert(DefinedRC);
2603     return isLegalRegOperand(MRI, OpInfo, *MO);
2604   }
2605
2606   // Handle non-register types that are treated like immediates.
2607   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2608
2609   if (!DefinedRC) {
2610     // This operand expects an immediate.
2611     return true;
2612   }
2613
2614   return isImmOperandLegal(MI, OpIdx, *MO);
2615 }
2616
2617 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2618                                        MachineInstr &MI) const {
2619   unsigned Opc = MI.getOpcode();
2620   const MCInstrDesc &InstrDesc = get(Opc);
2621
2622   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2623   MachineOperand &Src1 = MI.getOperand(Src1Idx);
2624
2625   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
2626   // we need to only have one constant bus use.
2627   //
2628   // Note we do not need to worry about literal constants here. They are
2629   // disabled for the operand type for instructions because they will always
2630   // violate the one constant bus use rule.
2631   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
2632   if (HasImplicitSGPR) {
2633     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2634     MachineOperand &Src0 = MI.getOperand(Src0Idx);
2635
2636     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
2637       legalizeOpWithMove(MI, Src0Idx);
2638   }
2639
2640   // VOP2 src0 instructions support all operand types, so we don't need to check
2641   // their legality. If src1 is already legal, we don't need to do anything.
2642   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
2643     return;
2644
2645   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
2646   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
2647   // select is uniform.
2648   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
2649       RI.isVGPR(MRI, Src1.getReg())) {
2650     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2651     const DebugLoc &DL = MI.getDebugLoc();
2652     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
2653         .add(Src1);
2654     Src1.ChangeToRegister(Reg, false);
2655     return;
2656   }
2657
2658   // We do not use commuteInstruction here because it is too aggressive and will
2659   // commute if it is possible. We only want to commute here if it improves
2660   // legality. This can be called a fairly large number of times so don't waste
2661   // compile time pointlessly swapping and checking legality again.
2662   if (HasImplicitSGPR || !MI.isCommutable()) {
2663     legalizeOpWithMove(MI, Src1Idx);
2664     return;
2665   }
2666
2667   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2668   MachineOperand &Src0 = MI.getOperand(Src0Idx);
2669
2670   // If src0 can be used as src1, commuting will make the operands legal.
2671   // Otherwise we have to give up and insert a move.
2672   //
2673   // TODO: Other immediate-like operand kinds could be commuted if there was a
2674   // MachineOperand::ChangeTo* for them.
2675   if ((!Src1.isImm() && !Src1.isReg()) ||
2676       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
2677     legalizeOpWithMove(MI, Src1Idx);
2678     return;
2679   }
2680
2681   int CommutedOpc = commuteOpcode(MI);
2682   if (CommutedOpc == -1) {
2683     legalizeOpWithMove(MI, Src1Idx);
2684     return;
2685   }
2686
2687   MI.setDesc(get(CommutedOpc));
2688
2689   unsigned Src0Reg = Src0.getReg();
2690   unsigned Src0SubReg = Src0.getSubReg();
2691   bool Src0Kill = Src0.isKill();
2692
2693   if (Src1.isImm())
2694     Src0.ChangeToImmediate(Src1.getImm());
2695   else if (Src1.isReg()) {
2696     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
2697     Src0.setSubReg(Src1.getSubReg());
2698   } else
2699     llvm_unreachable("Should only have register or immediate operands");
2700
2701   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
2702   Src1.setSubReg(Src0SubReg);
2703 }
2704
2705 // Legalize VOP3 operands. Because all operand types are supported for any
2706 // operand, and since literal constants are not allowed and should never be
2707 // seen, we only need to worry about inserting copies if we use multiple SGPR
2708 // operands.
2709 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
2710                                        MachineInstr &MI) const {
2711   unsigned Opc = MI.getOpcode();
2712
2713   int VOP3Idx[3] = {
2714     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
2715     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
2716     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
2717   };
2718
2719   // Find the one SGPR operand we are allowed to use.
2720   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
2721
2722   for (unsigned i = 0; i < 3; ++i) {
2723     int Idx = VOP3Idx[i];
2724     if (Idx == -1)
2725       break;
2726     MachineOperand &MO = MI.getOperand(Idx);
2727
2728     // We should never see a VOP3 instruction with an illegal immediate operand.
2729     if (!MO.isReg())
2730       continue;
2731
2732     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
2733       continue; // VGPRs are legal
2734
2735     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
2736       SGPRReg = MO.getReg();
2737       // We can use one SGPR in each VOP3 instruction.
2738       continue;
2739     }
2740
2741     // If we make it this far, then the operand is not legal and we must
2742     // legalize it.
2743     legalizeOpWithMove(MI, Idx);
2744   }
2745 }
2746
2747 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
2748                                          MachineRegisterInfo &MRI) const {
2749   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2750   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2751   unsigned DstReg = MRI.createVirtualRegister(SRC);
2752   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
2753
2754   SmallVector<unsigned, 8> SRegs;
2755   for (unsigned i = 0; i < SubRegs; ++i) {
2756     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2757     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2758             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2759         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2760     SRegs.push_back(SGPR);
2761   }
2762
2763   MachineInstrBuilder MIB =
2764       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2765               get(AMDGPU::REG_SEQUENCE), DstReg);
2766   for (unsigned i = 0; i < SubRegs; ++i) {
2767     MIB.addReg(SRegs[i]);
2768     MIB.addImm(RI.getSubRegFromChannel(i));
2769   }
2770   return DstReg;
2771 }
2772
2773 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2774                                        MachineInstr &MI) const {
2775
2776   // If the pointer is store in VGPRs, then we need to move them to
2777   // SGPRs using v_readfirstlane.  This is safe because we only select
2778   // loads with uniform pointers to SMRD instruction so we know the
2779   // pointer value is uniform.
2780   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
2781   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2782       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2783       SBase->setReg(SGPR);
2784   }
2785 }
2786
2787 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
2788                                          MachineBasicBlock::iterator I,
2789                                          const TargetRegisterClass *DstRC,
2790                                          MachineOperand &Op,
2791                                          MachineRegisterInfo &MRI,
2792                                          const DebugLoc &DL) const {
2793
2794   unsigned OpReg = Op.getReg();
2795   unsigned OpSubReg = Op.getSubReg();
2796
2797   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
2798       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
2799
2800   // Check if operand is already the correct register class.
2801   if (DstRC == OpRC)
2802     return;
2803
2804   unsigned DstReg = MRI.createVirtualRegister(DstRC);
2805   MachineInstr *Copy =
2806       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
2807
2808   Op.setReg(DstReg);
2809   Op.setSubReg(0);
2810
2811   MachineInstr *Def = MRI.getVRegDef(OpReg);
2812   if (!Def)
2813     return;
2814
2815   // Try to eliminate the copy if it is copying an immediate value.
2816   if (Def->isMoveImmediate())
2817     FoldImmediate(*Copy, *Def, OpReg, &MRI);
2818 }
2819
2820 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
2821   MachineFunction &MF = *MI.getParent()->getParent();
2822   MachineRegisterInfo &MRI = MF.getRegInfo();
2823
2824   // Legalize VOP2
2825   if (isVOP2(MI) || isVOPC(MI)) {
2826     legalizeOperandsVOP2(MRI, MI);
2827     return;
2828   }
2829
2830   // Legalize VOP3
2831   if (isVOP3(MI)) {
2832     legalizeOperandsVOP3(MRI, MI);
2833     return;
2834   }
2835
2836   // Legalize SMRD
2837   if (isSMRD(MI)) {
2838     legalizeOperandsSMRD(MRI, MI);
2839     return;
2840   }
2841
2842   // Legalize REG_SEQUENCE and PHI
2843   // The register class of the operands much be the same type as the register
2844   // class of the output.
2845   if (MI.getOpcode() == AMDGPU::PHI) {
2846     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2847     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
2848       if (!MI.getOperand(i).isReg() ||
2849           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
2850         continue;
2851       const TargetRegisterClass *OpRC =
2852           MRI.getRegClass(MI.getOperand(i).getReg());
2853       if (RI.hasVGPRs(OpRC)) {
2854         VRC = OpRC;
2855       } else {
2856         SRC = OpRC;
2857       }
2858     }
2859
2860     // If any of the operands are VGPR registers, then they all most be
2861     // otherwise we will create illegal VGPR->SGPR copies when legalizing
2862     // them.
2863     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
2864       if (!VRC) {
2865         assert(SRC);
2866         VRC = RI.getEquivalentVGPRClass(SRC);
2867       }
2868       RC = VRC;
2869     } else {
2870       RC = SRC;
2871     }
2872
2873     // Update all the operands so they have the same type.
2874     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2875       MachineOperand &Op = MI.getOperand(I);
2876       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2877         continue;
2878
2879       // MI is a PHI instruction.
2880       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
2881       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2882
2883       // Avoid creating no-op copies with the same src and dst reg class.  These
2884       // confuse some of the machine passes.
2885       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
2886     }
2887   }
2888
2889   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2890   // VGPR dest type and SGPR sources, insert copies so all operands are
2891   // VGPRs. This seems to help operand folding / the register coalescer.
2892   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
2893     MachineBasicBlock *MBB = MI.getParent();
2894     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
2895     if (RI.hasVGPRs(DstRC)) {
2896       // Update all the operands so they are VGPR register classes. These may
2897       // not be the same register class because REG_SEQUENCE supports mixing
2898       // subregister index types e.g. sub0_sub1 + sub2 + sub3
2899       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2900         MachineOperand &Op = MI.getOperand(I);
2901         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2902           continue;
2903
2904         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2905         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2906         if (VRC == OpRC)
2907           continue;
2908
2909         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
2910         Op.setIsKill();
2911       }
2912     }
2913
2914     return;
2915   }
2916
2917   // Legalize INSERT_SUBREG
2918   // src0 must have the same register class as dst
2919   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
2920     unsigned Dst = MI.getOperand(0).getReg();
2921     unsigned Src0 = MI.getOperand(1).getReg();
2922     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2923     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2924     if (DstRC != Src0RC) {
2925       MachineBasicBlock *MBB = MI.getParent();
2926       MachineOperand &Op = MI.getOperand(1);
2927       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
2928     }
2929     return;
2930   }
2931
2932   // Legalize MIMG and MUBUF/MTBUF for shaders.
2933   //
2934   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
2935   // scratch memory access. In both cases, the legalization never involves
2936   // conversion to the addr64 form.
2937   if (isMIMG(MI) ||
2938       (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
2939        (isMUBUF(MI) || isMTBUF(MI)))) {
2940     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
2941     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2942       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2943       SRsrc->setReg(SGPR);
2944     }
2945
2946     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
2947     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2948       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2949       SSamp->setReg(SGPR);
2950     }
2951     return;
2952   }
2953
2954   // Legalize MUBUF* instructions by converting to addr64 form.
2955   // FIXME: If we start using the non-addr64 instructions for compute, we
2956   // may need to legalize them as above. This especially applies to the
2957   // buffer_load_format_* variants and variants with idxen (or bothen).
2958   int SRsrcIdx =
2959       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
2960   if (SRsrcIdx != -1) {
2961     // We have an MUBUF instruction
2962     MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
2963     unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
2964     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2965                                              RI.getRegClass(SRsrcRC))) {
2966       // The operands are legal.
2967       // FIXME: We may need to legalize operands besided srsrc.
2968       return;
2969     }
2970
2971     MachineBasicBlock &MBB = *MI.getParent();
2972
2973     // Extract the ptr from the resource descriptor.
2974     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2975       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2976
2977     // Create an empty resource descriptor
2978     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2979     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2980     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2981     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2982     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2983
2984     // Zero64 = 0
2985     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
2986         .addImm(0);
2987
2988     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2989     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
2990         .addImm(RsrcDataFormat & 0xFFFFFFFF);
2991
2992     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2993     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
2994         .addImm(RsrcDataFormat >> 32);
2995
2996     // NewSRsrc = {Zero64, SRsrcFormat}
2997     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2998         .addReg(Zero64)
2999         .addImm(AMDGPU::sub0_sub1)
3000         .addReg(SRsrcFormatLo)
3001         .addImm(AMDGPU::sub2)
3002         .addReg(SRsrcFormatHi)
3003         .addImm(AMDGPU::sub3);
3004
3005     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3006     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3007     if (VAddr) {
3008       // This is already an ADDR64 instruction so we need to add the pointer
3009       // extracted from the resource descriptor to the current value of VAddr.
3010       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3011       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3012
3013       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3014       DebugLoc DL = MI.getDebugLoc();
3015       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3016         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3017         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3018
3019       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3020       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3021         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3022         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3023
3024       // NewVaddr = {NewVaddrHi, NewVaddrLo}
3025       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3026           .addReg(NewVAddrLo)
3027           .addImm(AMDGPU::sub0)
3028           .addReg(NewVAddrHi)
3029           .addImm(AMDGPU::sub1);
3030     } else {
3031       // This instructions is the _OFFSET variant, so we need to convert it to
3032       // ADDR64.
3033       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3034              < SISubtarget::VOLCANIC_ISLANDS &&
3035              "FIXME: Need to emit flat atomics here");
3036
3037       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3038       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3039       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3040       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3041
3042       // Atomics rith return have have an additional tied operand and are
3043       // missing some of the special bits.
3044       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3045       MachineInstr *Addr64;
3046
3047       if (!VDataIn) {
3048         // Regular buffer load / store.
3049         MachineInstrBuilder MIB =
3050             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3051                 .add(*VData)
3052                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3053                 // This will be replaced later
3054                 // with the new value of vaddr.
3055                 .add(*SRsrc)
3056                 .add(*SOffset)
3057                 .add(*Offset);
3058
3059         // Atomics do not have this operand.
3060         if (const MachineOperand *GLC =
3061                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
3062           MIB.addImm(GLC->getImm());
3063         }
3064
3065         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3066
3067         if (const MachineOperand *TFE =
3068                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3069           MIB.addImm(TFE->getImm());
3070         }
3071
3072         MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3073         Addr64 = MIB;
3074       } else {
3075         // Atomics with return.
3076         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3077                      .add(*VData)
3078                      .add(*VDataIn)
3079                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3080                      // This will be replaced later
3081                      // with the new value of vaddr.
3082                      .add(*SRsrc)
3083                      .add(*SOffset)
3084                      .add(*Offset)
3085                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3086                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3087       }
3088
3089       MI.removeFromParent();
3090
3091       // NewVaddr = {NewVaddrHi, NewVaddrLo}
3092       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3093               NewVAddr)
3094           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3095           .addImm(AMDGPU::sub0)
3096           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3097           .addImm(AMDGPU::sub1);
3098
3099       VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3100       SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3101     }
3102
3103     // Update the instruction to use NewVaddr
3104     VAddr->setReg(NewVAddr);
3105     // Update the instruction to use NewSRsrc
3106     SRsrc->setReg(NewSRsrc);
3107   }
3108 }
3109
3110 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3111   SmallVector<MachineInstr *, 128> Worklist;
3112   Worklist.push_back(&TopInst);
3113
3114   while (!Worklist.empty()) {
3115     MachineInstr &Inst = *Worklist.pop_back_val();
3116     MachineBasicBlock *MBB = Inst.getParent();
3117     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3118
3119     unsigned Opcode = Inst.getOpcode();
3120     unsigned NewOpcode = getVALUOp(Inst);
3121
3122     // Handle some special cases
3123     switch (Opcode) {
3124     default:
3125       break;
3126     case AMDGPU::S_AND_B64:
3127       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3128       Inst.eraseFromParent();
3129       continue;
3130
3131     case AMDGPU::S_OR_B64:
3132       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3133       Inst.eraseFromParent();
3134       continue;
3135
3136     case AMDGPU::S_XOR_B64:
3137       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3138       Inst.eraseFromParent();
3139       continue;
3140
3141     case AMDGPU::S_NOT_B64:
3142       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3143       Inst.eraseFromParent();
3144       continue;
3145
3146     case AMDGPU::S_BCNT1_I32_B64:
3147       splitScalar64BitBCNT(Worklist, Inst);
3148       Inst.eraseFromParent();
3149       continue;
3150
3151     case AMDGPU::S_BFE_I64: {
3152       splitScalar64BitBFE(Worklist, Inst);
3153       Inst.eraseFromParent();
3154       continue;
3155     }
3156
3157     case AMDGPU::S_LSHL_B32:
3158       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3159         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3160         swapOperands(Inst);
3161       }
3162       break;
3163     case AMDGPU::S_ASHR_I32:
3164       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3165         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3166         swapOperands(Inst);
3167       }
3168       break;
3169     case AMDGPU::S_LSHR_B32:
3170       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3171         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3172         swapOperands(Inst);
3173       }
3174       break;
3175     case AMDGPU::S_LSHL_B64:
3176       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3177         NewOpcode = AMDGPU::V_LSHLREV_B64;
3178         swapOperands(Inst);
3179       }
3180       break;
3181     case AMDGPU::S_ASHR_I64:
3182       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3183         NewOpcode = AMDGPU::V_ASHRREV_I64;
3184         swapOperands(Inst);
3185       }
3186       break;
3187     case AMDGPU::S_LSHR_B64:
3188       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3189         NewOpcode = AMDGPU::V_LSHRREV_B64;
3190         swapOperands(Inst);
3191       }
3192       break;
3193
3194     case AMDGPU::S_ABS_I32:
3195       lowerScalarAbs(Worklist, Inst);
3196       Inst.eraseFromParent();
3197       continue;
3198
3199     case AMDGPU::S_CBRANCH_SCC0:
3200     case AMDGPU::S_CBRANCH_SCC1:
3201       // Clear unused bits of vcc
3202       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3203               AMDGPU::VCC)
3204           .addReg(AMDGPU::EXEC)
3205           .addReg(AMDGPU::VCC);
3206       break;
3207
3208     case AMDGPU::S_BFE_U64:
3209     case AMDGPU::S_BFM_B64:
3210       llvm_unreachable("Moving this op to VALU not implemented");
3211
3212     case AMDGPU::S_PACK_LL_B32_B16:
3213     case AMDGPU::S_PACK_LH_B32_B16:
3214     case AMDGPU::S_PACK_HH_B32_B16: {
3215       movePackToVALU(Worklist, MRI, Inst);
3216       Inst.eraseFromParent();
3217       continue;
3218     }
3219     }
3220
3221     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3222       // We cannot move this instruction to the VALU, so we should try to
3223       // legalize its operands instead.
3224       legalizeOperands(Inst);
3225       continue;
3226     }
3227
3228     // Use the new VALU Opcode.
3229     const MCInstrDesc &NewDesc = get(NewOpcode);
3230     Inst.setDesc(NewDesc);
3231
3232     // Remove any references to SCC. Vector instructions can't read from it, and
3233     // We're just about to add the implicit use / defs of VCC, and we don't want
3234     // both.
3235     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3236       MachineOperand &Op = Inst.getOperand(i);
3237       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3238         Inst.RemoveOperand(i);
3239         addSCCDefUsersToVALUWorklist(Inst, Worklist);
3240       }
3241     }
3242
3243     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3244       // We are converting these to a BFE, so we need to add the missing
3245       // operands for the size and offset.
3246       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3247       Inst.addOperand(MachineOperand::CreateImm(0));
3248       Inst.addOperand(MachineOperand::CreateImm(Size));
3249
3250     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3251       // The VALU version adds the second operand to the result, so insert an
3252       // extra 0 operand.
3253       Inst.addOperand(MachineOperand::CreateImm(0));
3254     }
3255
3256     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3257
3258     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3259       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3260       // If we need to move this to VGPRs, we need to unpack the second operand
3261       // back into the 2 separate ones for bit offset and width.
3262       assert(OffsetWidthOp.isImm() &&
3263              "Scalar BFE is only implemented for constant width and offset");
3264       uint32_t Imm = OffsetWidthOp.getImm();
3265
3266       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3267       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3268       Inst.RemoveOperand(2);                     // Remove old immediate.
3269       Inst.addOperand(MachineOperand::CreateImm(Offset));
3270       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3271     }
3272
3273     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3274     unsigned NewDstReg = AMDGPU::NoRegister;
3275     if (HasDst) {
3276       unsigned DstReg = Inst.getOperand(0).getReg();
3277       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
3278         continue;
3279
3280       // Update the destination register class.
3281       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3282       if (!NewDstRC)
3283         continue;
3284
3285       if (Inst.isCopy() &&
3286           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3287           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
3288         // Instead of creating a copy where src and dst are the same register
3289         // class, we just replace all uses of dst with src.  These kinds of
3290         // copies interfere with the heuristics MachineSink uses to decide
3291         // whether or not to split a critical edge.  Since the pass assumes
3292         // that copies will end up as machine instructions and not be
3293         // eliminated.
3294         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3295         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3296         MRI.clearKillFlags(Inst.getOperand(1).getReg());
3297         Inst.getOperand(0).setReg(DstReg);
3298         continue;
3299       }
3300
3301       NewDstReg = MRI.createVirtualRegister(NewDstRC);
3302       MRI.replaceRegWith(DstReg, NewDstReg);
3303     }
3304
3305     // Legalize the operands
3306     legalizeOperands(Inst);
3307
3308     if (HasDst)
3309      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3310   }
3311 }
3312
3313 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
3314                                  MachineInstr &Inst) const {
3315   MachineBasicBlock &MBB = *Inst.getParent();
3316   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3317   MachineBasicBlock::iterator MII = Inst;
3318   DebugLoc DL = Inst.getDebugLoc();
3319
3320   MachineOperand &Dest = Inst.getOperand(0);
3321   MachineOperand &Src = Inst.getOperand(1);
3322   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3323   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3324
3325   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3326     .addImm(0)
3327     .addReg(Src.getReg());
3328
3329   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3330     .addReg(Src.getReg())
3331     .addReg(TmpReg);
3332
3333   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3334   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3335 }
3336
3337 void SIInstrInfo::splitScalar64BitUnaryOp(
3338     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3339     unsigned Opcode) const {
3340   MachineBasicBlock &MBB = *Inst.getParent();
3341   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3342
3343   MachineOperand &Dest = Inst.getOperand(0);
3344   MachineOperand &Src0 = Inst.getOperand(1);
3345   DebugLoc DL = Inst.getDebugLoc();
3346
3347   MachineBasicBlock::iterator MII = Inst;
3348
3349   const MCInstrDesc &InstDesc = get(Opcode);
3350   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3351     MRI.getRegClass(Src0.getReg()) :
3352     &AMDGPU::SGPR_32RegClass;
3353
3354   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3355
3356   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3357                                                        AMDGPU::sub0, Src0SubRC);
3358
3359   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3360   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3361   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3362
3363   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3364   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3365
3366   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3367                                                        AMDGPU::sub1, Src0SubRC);
3368
3369   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3370   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3371
3372   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3373   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3374     .addReg(DestSub0)
3375     .addImm(AMDGPU::sub0)
3376     .addReg(DestSub1)
3377     .addImm(AMDGPU::sub1);
3378
3379   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3380
3381   // We don't need to legalizeOperands here because for a single operand, src0
3382   // will support any kind of input.
3383
3384   // Move all users of this moved value.
3385   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3386 }
3387
3388 void SIInstrInfo::splitScalar64BitBinaryOp(
3389     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3390     unsigned Opcode) const {
3391   MachineBasicBlock &MBB = *Inst.getParent();
3392   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3393
3394   MachineOperand &Dest = Inst.getOperand(0);
3395   MachineOperand &Src0 = Inst.getOperand(1);
3396   MachineOperand &Src1 = Inst.getOperand(2);
3397   DebugLoc DL = Inst.getDebugLoc();
3398
3399   MachineBasicBlock::iterator MII = Inst;
3400
3401   const MCInstrDesc &InstDesc = get(Opcode);
3402   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3403     MRI.getRegClass(Src0.getReg()) :
3404     &AMDGPU::SGPR_32RegClass;
3405
3406   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3407   const TargetRegisterClass *Src1RC = Src1.isReg() ?
3408     MRI.getRegClass(Src1.getReg()) :
3409     &AMDGPU::SGPR_32RegClass;
3410
3411   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3412
3413   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3414                                                        AMDGPU::sub0, Src0SubRC);
3415   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3416                                                        AMDGPU::sub0, Src1SubRC);
3417
3418   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3419   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3420   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3421
3422   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3423   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3424                               .add(SrcReg0Sub0)
3425                               .add(SrcReg1Sub0);
3426
3427   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3428                                                        AMDGPU::sub1, Src0SubRC);
3429   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3430                                                        AMDGPU::sub1, Src1SubRC);
3431
3432   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3433   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3434                               .add(SrcReg0Sub1)
3435                               .add(SrcReg1Sub1);
3436
3437   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3438   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3439     .addReg(DestSub0)
3440     .addImm(AMDGPU::sub0)
3441     .addReg(DestSub1)
3442     .addImm(AMDGPU::sub1);
3443
3444   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3445
3446   // Try to legalize the operands in case we need to swap the order to keep it
3447   // valid.
3448   legalizeOperands(LoHalf);
3449   legalizeOperands(HiHalf);
3450
3451   // Move all users of this moved vlaue.
3452   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3453 }
3454
3455 void SIInstrInfo::splitScalar64BitBCNT(
3456     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
3457   MachineBasicBlock &MBB = *Inst.getParent();
3458   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3459
3460   MachineBasicBlock::iterator MII = Inst;
3461   DebugLoc DL = Inst.getDebugLoc();
3462
3463   MachineOperand &Dest = Inst.getOperand(0);
3464   MachineOperand &Src = Inst.getOperand(1);
3465
3466   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3467   const TargetRegisterClass *SrcRC = Src.isReg() ?
3468     MRI.getRegClass(Src.getReg()) :
3469     &AMDGPU::SGPR_32RegClass;
3470
3471   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3472   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3473
3474   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3475
3476   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3477                                                       AMDGPU::sub0, SrcSubRC);
3478   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3479                                                       AMDGPU::sub1, SrcSubRC);
3480
3481   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3482
3483   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3484
3485   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3486
3487   // We don't need to legalize operands here. src0 for etiher instruction can be
3488   // an SGPR, and the second input is unused or determined here.
3489   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3490 }
3491
3492 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
3493                                       MachineInstr &Inst) const {
3494   MachineBasicBlock &MBB = *Inst.getParent();
3495   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3496   MachineBasicBlock::iterator MII = Inst;
3497   DebugLoc DL = Inst.getDebugLoc();
3498
3499   MachineOperand &Dest = Inst.getOperand(0);
3500   uint32_t Imm = Inst.getOperand(2).getImm();
3501   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3502   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3503
3504   (void) Offset;
3505
3506   // Only sext_inreg cases handled.
3507   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3508          Offset == 0 && "Not implemented");
3509
3510   if (BitWidth < 32) {
3511     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3512     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3513     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3514
3515     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3516         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3517         .addImm(0)
3518         .addImm(BitWidth);
3519
3520     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3521       .addImm(31)
3522       .addReg(MidRegLo);
3523
3524     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3525       .addReg(MidRegLo)
3526       .addImm(AMDGPU::sub0)
3527       .addReg(MidRegHi)
3528       .addImm(AMDGPU::sub1);
3529
3530     MRI.replaceRegWith(Dest.getReg(), ResultReg);
3531     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3532     return;
3533   }
3534
3535   MachineOperand &Src = Inst.getOperand(1);
3536   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3537   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3538
3539   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3540     .addImm(31)
3541     .addReg(Src.getReg(), 0, AMDGPU::sub0);
3542
3543   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3544     .addReg(Src.getReg(), 0, AMDGPU::sub0)
3545     .addImm(AMDGPU::sub0)
3546     .addReg(TmpReg)
3547     .addImm(AMDGPU::sub1);
3548
3549   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3550   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3551 }
3552
3553 void SIInstrInfo::addUsersToMoveToVALUWorklist(
3554   unsigned DstReg,
3555   MachineRegisterInfo &MRI,
3556   SmallVectorImpl<MachineInstr *> &Worklist) const {
3557   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3558          E = MRI.use_end(); I != E;) {
3559     MachineInstr &UseMI = *I->getParent();
3560     if (!canReadVGPR(UseMI, I.getOperandNo())) {
3561       Worklist.push_back(&UseMI);
3562
3563       do {
3564         ++I;
3565       } while (I != E && I->getParent() == &UseMI);
3566     } else {
3567       ++I;
3568     }
3569   }
3570 }
3571
3572 void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
3573                                  MachineRegisterInfo &MRI,
3574                                  MachineInstr &Inst) const {
3575   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3576   MachineBasicBlock *MBB = Inst.getParent();
3577   MachineOperand &Src0 = Inst.getOperand(1);
3578   MachineOperand &Src1 = Inst.getOperand(2);
3579   const DebugLoc &DL = Inst.getDebugLoc();
3580
3581   switch (Inst.getOpcode()) {
3582   case AMDGPU::S_PACK_LL_B32_B16: {
3583     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3584     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3585
3586     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
3587     // 0.
3588     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3589       .addImm(0xffff);
3590
3591     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
3592       .addReg(ImmReg, RegState::Kill)
3593       .add(Src0);
3594
3595     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
3596       .add(Src1)
3597       .addImm(16)
3598       .addReg(TmpReg, RegState::Kill);
3599     break;
3600   }
3601   case AMDGPU::S_PACK_LH_B32_B16: {
3602     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3603     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3604       .addImm(0xffff);
3605     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
3606       .addReg(ImmReg, RegState::Kill)
3607       .add(Src0)
3608       .add(Src1);
3609     break;
3610   }
3611   case AMDGPU::S_PACK_HH_B32_B16: {
3612     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3613     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3614     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
3615       .addImm(16)
3616       .add(Src0);
3617     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3618       .addImm(0xffff0000);
3619     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
3620       .add(Src1)
3621       .addReg(ImmReg, RegState::Kill)
3622       .addReg(TmpReg, RegState::Kill);
3623     break;
3624   }
3625   default:
3626     llvm_unreachable("unhandled s_pack_* instruction");
3627   }
3628
3629   MachineOperand &Dest = Inst.getOperand(0);
3630   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3631   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3632 }
3633
3634 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
3635     MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
3636   // This assumes that all the users of SCC are in the same block
3637   // as the SCC def.
3638   for (MachineInstr &MI :
3639        llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
3640                         SCCDefInst.getParent()->end())) {
3641     // Exit if we find another SCC def.
3642     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
3643       return;
3644
3645     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
3646       Worklist.push_back(&MI);
3647   }
3648 }
3649
3650 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
3651   const MachineInstr &Inst) const {
3652   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
3653
3654   switch (Inst.getOpcode()) {
3655   // For target instructions, getOpRegClass just returns the virtual register
3656   // class associated with the operand, so we need to find an equivalent VGPR
3657   // register class in order to move the instruction to the VALU.
3658   case AMDGPU::COPY:
3659   case AMDGPU::PHI:
3660   case AMDGPU::REG_SEQUENCE:
3661   case AMDGPU::INSERT_SUBREG:
3662     if (RI.hasVGPRs(NewDstRC))
3663       return nullptr;
3664
3665     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
3666     if (!NewDstRC)
3667       return nullptr;
3668     return NewDstRC;
3669   default:
3670     return NewDstRC;
3671   }
3672 }
3673
3674 // Find the one SGPR operand we are allowed to use.
3675 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
3676                                    int OpIndices[3]) const {
3677   const MCInstrDesc &Desc = MI.getDesc();
3678
3679   // Find the one SGPR operand we are allowed to use.
3680   //
3681   // First we need to consider the instruction's operand requirements before
3682   // legalizing. Some operands are required to be SGPRs, such as implicit uses
3683   // of VCC, but we are still bound by the constant bus requirement to only use
3684   // one.
3685   //
3686   // If the operand's class is an SGPR, we can never move it.
3687
3688   unsigned SGPRReg = findImplicitSGPRRead(MI);
3689   if (SGPRReg != AMDGPU::NoRegister)
3690     return SGPRReg;
3691
3692   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
3693   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3694
3695   for (unsigned i = 0; i < 3; ++i) {
3696     int Idx = OpIndices[i];
3697     if (Idx == -1)
3698       break;
3699
3700     const MachineOperand &MO = MI.getOperand(Idx);
3701     if (!MO.isReg())
3702       continue;
3703
3704     // Is this operand statically required to be an SGPR based on the operand
3705     // constraints?
3706     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
3707     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
3708     if (IsRequiredSGPR)
3709       return MO.getReg();
3710
3711     // If this could be a VGPR or an SGPR, Check the dynamic register class.
3712     unsigned Reg = MO.getReg();
3713     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
3714     if (RI.isSGPRClass(RegRC))
3715       UsedSGPRs[i] = Reg;
3716   }
3717
3718   // We don't have a required SGPR operand, so we have a bit more freedom in
3719   // selecting operands to move.
3720
3721   // Try to select the most used SGPR. If an SGPR is equal to one of the
3722   // others, we choose that.
3723   //
3724   // e.g.
3725   // V_FMA_F32 v0, s0, s0, s0 -> No moves
3726   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
3727
3728   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
3729   // prefer those.
3730
3731   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
3732     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
3733       SGPRReg = UsedSGPRs[0];
3734   }
3735
3736   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
3737     if (UsedSGPRs[1] == UsedSGPRs[2])
3738       SGPRReg = UsedSGPRs[1];
3739   }
3740
3741   return SGPRReg;
3742 }
3743
3744 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
3745                                              unsigned OperandName) const {
3746   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
3747   if (Idx == -1)
3748     return nullptr;
3749
3750   return &MI.getOperand(Idx);
3751 }
3752
3753 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
3754   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
3755   if (ST.isAmdHsaOS()) {
3756     // Set ATC = 1. GFX9 doesn't have this bit.
3757     if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
3758       RsrcDataFormat |= (1ULL << 56);
3759
3760     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
3761     // BTW, it disables TC L2 and therefore decreases performance.
3762     if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
3763       RsrcDataFormat |= (2ULL << 59);
3764   }
3765
3766   return RsrcDataFormat;
3767 }
3768
3769 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
3770   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
3771                     AMDGPU::RSRC_TID_ENABLE |
3772                     0xffffffff; // Size;
3773
3774   // GFX9 doesn't have ELEMENT_SIZE.
3775   if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
3776     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3777     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
3778   }
3779
3780   // IndexStride = 64.
3781   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
3782
3783   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
3784   // Clear them unless we want a huge stride.
3785   if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3786     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
3787
3788   return Rsrc23;
3789 }
3790
3791 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
3792   unsigned Opc = MI.getOpcode();
3793
3794   return isSMRD(Opc);
3795 }
3796
3797 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
3798   unsigned Opc = MI.getOpcode();
3799
3800   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
3801 }
3802
3803 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
3804                                     int &FrameIndex) const {
3805   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3806   if (!Addr || !Addr->isFI())
3807     return AMDGPU::NoRegister;
3808
3809   assert(!MI.memoperands_empty() &&
3810          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
3811
3812   FrameIndex = Addr->getIndex();
3813   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
3814 }
3815
3816 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
3817                                         int &FrameIndex) const {
3818   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
3819   assert(Addr && Addr->isFI());
3820   FrameIndex = Addr->getIndex();
3821   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
3822 }
3823
3824 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
3825                                           int &FrameIndex) const {
3826
3827   if (!MI.mayLoad())
3828     return AMDGPU::NoRegister;
3829
3830   if (isMUBUF(MI) || isVGPRSpill(MI))
3831     return isStackAccess(MI, FrameIndex);
3832
3833   if (isSGPRSpill(MI))
3834     return isSGPRStackAccess(MI, FrameIndex);
3835
3836   return AMDGPU::NoRegister;
3837 }
3838
3839 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
3840                                          int &FrameIndex) const {
3841   if (!MI.mayStore())
3842     return AMDGPU::NoRegister;
3843
3844   if (isMUBUF(MI) || isVGPRSpill(MI))
3845     return isStackAccess(MI, FrameIndex);
3846
3847   if (isSGPRSpill(MI))
3848     return isSGPRStackAccess(MI, FrameIndex);
3849
3850   return AMDGPU::NoRegister;
3851 }
3852
3853 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
3854   unsigned Opc = MI.getOpcode();
3855   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
3856   unsigned DescSize = Desc.getSize();
3857
3858   // If we have a definitive size, we can use it. Otherwise we need to inspect
3859   // the operands to know the size.
3860   //
3861   // FIXME: Instructions that have a base 32-bit encoding report their size as
3862   // 4, even though they are really 8 bytes if they have a literal operand.
3863   if (DescSize != 0 && DescSize != 4)
3864     return DescSize;
3865
3866   // 4-byte instructions may have a 32-bit literal encoded after them. Check
3867   // operands that coud ever be literals.
3868   if (isVALU(MI) || isSALU(MI)) {
3869     if (isFixedSize(MI))
3870       return DescSize;
3871
3872     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3873     if (Src0Idx == -1)
3874       return 4; // No operands.
3875
3876     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
3877       return 8;
3878
3879     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3880     if (Src1Idx == -1)
3881       return 4;
3882
3883     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
3884       return 8;
3885
3886     return 4;
3887   }
3888
3889   if (DescSize == 4)
3890     return 4;
3891
3892   switch (Opc) {
3893   case TargetOpcode::IMPLICIT_DEF:
3894   case TargetOpcode::KILL:
3895   case TargetOpcode::DBG_VALUE:
3896   case TargetOpcode::BUNDLE:
3897   case TargetOpcode::EH_LABEL:
3898     return 0;
3899   case TargetOpcode::INLINEASM: {
3900     const MachineFunction *MF = MI.getParent()->getParent();
3901     const char *AsmStr = MI.getOperand(0).getSymbolName();
3902     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
3903   }
3904   default:
3905     llvm_unreachable("unable to find instruction size");
3906   }
3907 }
3908
3909 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
3910   if (!isFLAT(MI))
3911     return false;
3912
3913   if (MI.memoperands_empty())
3914     return true;
3915
3916   for (const MachineMemOperand *MMO : MI.memoperands()) {
3917     if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
3918       return true;
3919   }
3920   return false;
3921 }
3922
3923 ArrayRef<std::pair<int, const char *>>
3924 SIInstrInfo::getSerializableTargetIndices() const {
3925   static const std::pair<int, const char *> TargetIndices[] = {
3926       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
3927       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
3928       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
3929       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
3930       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
3931   return makeArrayRef(TargetIndices);
3932 }
3933
3934 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
3935 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
3936 ScheduleHazardRecognizer *
3937 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
3938                                             const ScheduleDAG *DAG) const {
3939   return new GCNHazardRecognizer(DAG->MF);
3940 }
3941
3942 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
3943 /// pass.
3944 ScheduleHazardRecognizer *
3945 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
3946   return new GCNHazardRecognizer(MF);
3947 }
3948
3949 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
3950   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
3951          MI.modifiesRegister(AMDGPU::EXEC, &RI);
3952 }
3953
3954 MachineInstrBuilder
3955 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
3956                            MachineBasicBlock::iterator I,
3957                            const DebugLoc &DL,
3958                            unsigned DestReg) const {
3959   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3960
3961   unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3962
3963   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
3964            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
3965 }