contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
  34                                        const R600Subtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  37   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  39   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  40   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  41   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Legalize loads and stores to the private address space.
  46   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  47   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  48   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  49
  50   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  51   // spaces, so it is custom lowered to handle those where it isn't.
  52   for (MVT VT : MVT::integer_valuetypes()) {
  53     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  54     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
  55     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
  56
  57     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
  58     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
  59     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
  60
  61     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
  62     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
  63     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  64   }
  65
  66   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  67   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  68   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  69   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  70
  71   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  72   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  73   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  74
  75
  76   setOperationAction(ISD::STORE, MVT::i8, Custom);
  77   setOperationAction(ISD::STORE, MVT::i32, Custom);
  78   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  79   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  80
  81   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  82   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  83
  84   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  85   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  86   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
  87
  88   // Set condition code actions
  89   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  90   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  91   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  92   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  93   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  94   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  95   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  96   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  97   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  98   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  99   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
 100   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
 101
 102   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
 103   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
 104   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
 105   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
 106
 107   setOperationAction(ISD::FCOS, MVT::f32, Custom);
 108   setOperationAction(ISD::FSIN, MVT::f32, Custom);
 109
 110   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
 111   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 112
 113   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
 114   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
 115   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 116
 117   setOperationAction(ISD::FSUB, MVT::f32, Expand);
 118
 119   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 120   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 121
 122   setOperationAction(ISD::SETCC, MVT::i32, Expand);
 123   setOperationAction(ISD::SETCC, MVT::f32, Expand);
 124   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
 125   setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
 126   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 127   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 128
 129   setOperationAction(ISD::SELECT, MVT::i32, Expand);
 130   setOperationAction(ISD::SELECT, MVT::f32, Expand);
 131   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
 132   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
 133
 134   // ADD, SUB overflow.
 135   // TODO: turn these into Legal?
 136   if (Subtarget->hasCARRY())
 137     setOperationAction(ISD::UADDO, MVT::i32, Custom);
 138
 139   if (Subtarget->hasBORROW())
 140     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 141
 142   // Expand sign extension of vectors
 143   if (!Subtarget->hasBFE())
 144     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 145
 146   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 147   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 148
 149   if (!Subtarget->hasBFE())
 150     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 151   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 152   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 153
 154   if (!Subtarget->hasBFE())
 155     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 156   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 157   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 158
 159   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 160   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 161   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 162
 163   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 164
 165   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 166
 167   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 168   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 169   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 170   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 171
 172   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 173   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 174   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 175   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 176
 177   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 178   //  to be Legal/Custom in order to avoid library calls.
 179   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 180   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 181   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 182
 183   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 184
 185   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 186   for (MVT VT : ScalarIntVTs) {
 187     setOperationAction(ISD::ADDC, VT, Expand);
 188     setOperationAction(ISD::SUBC, VT, Expand);
 189     setOperationAction(ISD::ADDE, VT, Expand);
 190     setOperationAction(ISD::SUBE, VT, Expand);
 191   }
 192
 193   setSchedulingPreference(Sched::Source);
 194
 195
 196   setTargetDAGCombine(ISD::FP_ROUND);
 197   setTargetDAGCombine(ISD::FP_TO_SINT);
 198   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 199   setTargetDAGCombine(ISD::SELECT_CC);
 200   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 201 }
 202
 203 const R600Subtarget *R600TargetLowering::getSubtarget() const {
 204   return static_cast<const R600Subtarget *>(Subtarget);
 205 }
 206
 207 static inline bool isEOP(MachineBasicBlock::iterator I) {
 208   return std::next(I)->getOpcode() == AMDGPU::RETURN;
 209 }
 210
 211 MachineBasicBlock *
 212 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 213                                                 MachineBasicBlock *BB) const {
 214   MachineFunction * MF = BB->getParent();
 215   MachineRegisterInfo &MRI = MF->getRegInfo();
 216   MachineBasicBlock::iterator I = MI;
 217   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
 218
 219   switch (MI.getOpcode()) {
 220   default:
 221     // Replace LDS_*_RET instruction that don't have any uses with the
 222     // equivalent LDS_*_NORET instruction.
 223     if (TII->isLDSRetInstr(MI.getOpcode())) {
 224       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
 225       assert(DstIdx != -1);
 226       MachineInstrBuilder NewMI;
 227       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 228       //        LDS_1A2D support and remove this special case.
 229       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
 230           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
 231         return BB;
 232
 233       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 234                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
 235       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
 236         NewMI.addOperand(MI.getOperand(i));
 237       }
 238     } else {
 239       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 240     }
 241     break;
 242   case AMDGPU::CLAMP_R600: {
 243     MachineInstr *NewMI = TII->buildDefaultInstruction(
 244         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 245         MI.getOperand(1).getReg());
 246     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
 247     break;
 248   }
 249
 250   case AMDGPU::FABS_R600: {
 251     MachineInstr *NewMI = TII->buildDefaultInstruction(
 252         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 253         MI.getOperand(1).getReg());
 254     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
 255     break;
 256   }
 257
 258   case AMDGPU::FNEG_R600: {
 259     MachineInstr *NewMI = TII->buildDefaultInstruction(
 260         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 261         MI.getOperand(1).getReg());
 262     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
 263     break;
 264   }
 265
 266   case AMDGPU::MASK_WRITE: {
 267     unsigned maskedRegister = MI.getOperand(0).getReg();
 268     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 269     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 270     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
 271     break;
 272   }
 273
 274   case AMDGPU::MOV_IMM_F32:
 275     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
 276                                                             .getFPImm()
 277                                                             ->getValueAPF()
 278                                                             .bitcastToAPInt()
 279                                                             .getZExtValue());
 280     break;
 281   case AMDGPU::MOV_IMM_I32:
 282     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
 283                      MI.getOperand(1).getImm());
 284     break;
 285   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
 286     //TODO: Perhaps combine this instruction with the next if possible
 287     auto MIB = TII->buildDefaultInstruction(
 288         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
 289     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
 290     //TODO: Ugh this is rather ugly
 291     MIB->getOperand(Idx) = MI.getOperand(1);
 292     break;
 293   }
 294   case AMDGPU::CONST_COPY: {
 295     MachineInstr *NewMI = TII->buildDefaultInstruction(
 296         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
 297     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
 298                        MI.getOperand(1).getImm());
 299     break;
 300   }
 301
 302   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 303   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 304   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 305     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 306         .addOperand(MI.getOperand(0))
 307         .addOperand(MI.getOperand(1))
 308         .addImm(isEOP(I)); // Set End of program bit
 309     break;
 310   }
 311   case AMDGPU::RAT_STORE_TYPED_eg: {
 312     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 313         .addOperand(MI.getOperand(0))
 314         .addOperand(MI.getOperand(1))
 315         .addOperand(MI.getOperand(2))
 316         .addImm(isEOP(I)); // Set End of program bit
 317     break;
 318   }
 319
 320   case AMDGPU::TXD: {
 321     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 322     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 323     MachineOperand &RID = MI.getOperand(4);
 324     MachineOperand &SID = MI.getOperand(5);
 325     unsigned TextureId = MI.getOperand(6).getImm();
 326     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 327     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 328
 329     switch (TextureId) {
 330     case 5: // Rect
 331       CTX = CTY = 0;
 332       break;
 333     case 6: // Shadow1D
 334       SrcW = SrcZ;
 335       break;
 336     case 7: // Shadow2D
 337       SrcW = SrcZ;
 338       break;
 339     case 8: // ShadowRect
 340       CTX = CTY = 0;
 341       SrcW = SrcZ;
 342       break;
 343     case 9: // 1DArray
 344       SrcZ = SrcY;
 345       CTZ = 0;
 346       break;
 347     case 10: // 2DArray
 348       CTZ = 0;
 349       break;
 350     case 11: // Shadow1DArray
 351       SrcZ = SrcY;
 352       CTZ = 0;
 353       break;
 354     case 12: // Shadow2DArray
 355       CTZ = 0;
 356       break;
 357     }
 358     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
 359             T0)
 360         .addOperand(MI.getOperand(3))
 361         .addImm(SrcX)
 362         .addImm(SrcY)
 363         .addImm(SrcZ)
 364         .addImm(SrcW)
 365         .addImm(0)
 366         .addImm(0)
 367         .addImm(0)
 368         .addImm(0)
 369         .addImm(1)
 370         .addImm(2)
 371         .addImm(3)
 372         .addOperand(RID)
 373         .addOperand(SID)
 374         .addImm(CTX)
 375         .addImm(CTY)
 376         .addImm(CTZ)
 377         .addImm(CTW);
 378     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
 379             T1)
 380         .addOperand(MI.getOperand(2))
 381         .addImm(SrcX)
 382         .addImm(SrcY)
 383         .addImm(SrcZ)
 384         .addImm(SrcW)
 385         .addImm(0)
 386         .addImm(0)
 387         .addImm(0)
 388         .addImm(0)
 389         .addImm(1)
 390         .addImm(2)
 391         .addImm(3)
 392         .addOperand(RID)
 393         .addOperand(SID)
 394         .addImm(CTX)
 395         .addImm(CTY)
 396         .addImm(CTZ)
 397         .addImm(CTW);
 398     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 399         .addOperand(MI.getOperand(0))
 400         .addOperand(MI.getOperand(1))
 401         .addImm(SrcX)
 402         .addImm(SrcY)
 403         .addImm(SrcZ)
 404         .addImm(SrcW)
 405         .addImm(0)
 406         .addImm(0)
 407         .addImm(0)
 408         .addImm(0)
 409         .addImm(1)
 410         .addImm(2)
 411         .addImm(3)
 412         .addOperand(RID)
 413         .addOperand(SID)
 414         .addImm(CTX)
 415         .addImm(CTY)
 416         .addImm(CTZ)
 417         .addImm(CTW)
 418         .addReg(T0, RegState::Implicit)
 419         .addReg(T1, RegState::Implicit);
 420     break;
 421   }
 422
 423   case AMDGPU::TXD_SHADOW: {
 424     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 425     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 426     MachineOperand &RID = MI.getOperand(4);
 427     MachineOperand &SID = MI.getOperand(5);
 428     unsigned TextureId = MI.getOperand(6).getImm();
 429     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 430     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 431
 432     switch (TextureId) {
 433     case 5: // Rect
 434       CTX = CTY = 0;
 435       break;
 436     case 6: // Shadow1D
 437       SrcW = SrcZ;
 438       break;
 439     case 7: // Shadow2D
 440       SrcW = SrcZ;
 441       break;
 442     case 8: // ShadowRect
 443       CTX = CTY = 0;
 444       SrcW = SrcZ;
 445       break;
 446     case 9: // 1DArray
 447       SrcZ = SrcY;
 448       CTZ = 0;
 449       break;
 450     case 10: // 2DArray
 451       CTZ = 0;
 452       break;
 453     case 11: // Shadow1DArray
 454       SrcZ = SrcY;
 455       CTZ = 0;
 456       break;
 457     case 12: // Shadow2DArray
 458       CTZ = 0;
 459       break;
 460     }
 461
 462     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
 463             T0)
 464         .addOperand(MI.getOperand(3))
 465         .addImm(SrcX)
 466         .addImm(SrcY)
 467         .addImm(SrcZ)
 468         .addImm(SrcW)
 469         .addImm(0)
 470         .addImm(0)
 471         .addImm(0)
 472         .addImm(0)
 473         .addImm(1)
 474         .addImm(2)
 475         .addImm(3)
 476         .addOperand(RID)
 477         .addOperand(SID)
 478         .addImm(CTX)
 479         .addImm(CTY)
 480         .addImm(CTZ)
 481         .addImm(CTW);
 482     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
 483             T1)
 484         .addOperand(MI.getOperand(2))
 485         .addImm(SrcX)
 486         .addImm(SrcY)
 487         .addImm(SrcZ)
 488         .addImm(SrcW)
 489         .addImm(0)
 490         .addImm(0)
 491         .addImm(0)
 492         .addImm(0)
 493         .addImm(1)
 494         .addImm(2)
 495         .addImm(3)
 496         .addOperand(RID)
 497         .addOperand(SID)
 498         .addImm(CTX)
 499         .addImm(CTY)
 500         .addImm(CTZ)
 501         .addImm(CTW);
 502     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 503         .addOperand(MI.getOperand(0))
 504         .addOperand(MI.getOperand(1))
 505         .addImm(SrcX)
 506         .addImm(SrcY)
 507         .addImm(SrcZ)
 508         .addImm(SrcW)
 509         .addImm(0)
 510         .addImm(0)
 511         .addImm(0)
 512         .addImm(0)
 513         .addImm(1)
 514         .addImm(2)
 515         .addImm(3)
 516         .addOperand(RID)
 517         .addOperand(SID)
 518         .addImm(CTX)
 519         .addImm(CTY)
 520         .addImm(CTZ)
 521         .addImm(CTW)
 522         .addReg(T0, RegState::Implicit)
 523         .addReg(T1, RegState::Implicit);
 524     break;
 525   }
 526
 527   case AMDGPU::BRANCH:
 528     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 529         .addOperand(MI.getOperand(0));
 530     break;
 531
 532   case AMDGPU::BRANCH_COND_f32: {
 533     MachineInstr *NewMI =
 534         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 535                 AMDGPU::PREDICATE_BIT)
 536             .addOperand(MI.getOperand(1))
 537             .addImm(OPCODE_IS_NOT_ZERO)
 538             .addImm(0); // Flags
 539     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
 540     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 541         .addOperand(MI.getOperand(0))
 542         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 543     break;
 544   }
 545
 546   case AMDGPU::BRANCH_COND_i32: {
 547     MachineInstr *NewMI =
 548         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 549                 AMDGPU::PREDICATE_BIT)
 550             .addOperand(MI.getOperand(1))
 551             .addImm(OPCODE_IS_NOT_ZERO_INT)
 552             .addImm(0); // Flags
 553     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
 554     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 555         .addOperand(MI.getOperand(0))
 556         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 557     break;
 558   }
 559
 560   case AMDGPU::EG_ExportSwz:
 561   case AMDGPU::R600_ExportSwz: {
 562     // Instruction is left unmodified if its not the last one of its type
 563     bool isLastInstructionOfItsType = true;
 564     unsigned InstExportType = MI.getOperand(1).getImm();
 565     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 566          EndBlock = BB->end(); NextExportInst != EndBlock;
 567          NextExportInst = std::next(NextExportInst)) {
 568       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 569           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 570         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 571             .getImm();
 572         if (CurrentInstExportType == InstExportType) {
 573           isLastInstructionOfItsType = false;
 574           break;
 575         }
 576       }
 577     }
 578     bool EOP = isEOP(I);
 579     if (!EOP && !isLastInstructionOfItsType)
 580       return BB;
 581     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
 582     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 583         .addOperand(MI.getOperand(0))
 584         .addOperand(MI.getOperand(1))
 585         .addOperand(MI.getOperand(2))
 586         .addOperand(MI.getOperand(3))
 587         .addOperand(MI.getOperand(4))
 588         .addOperand(MI.getOperand(5))
 589         .addOperand(MI.getOperand(6))
 590         .addImm(CfInst)
 591         .addImm(EOP);
 592     break;
 593   }
 594   case AMDGPU::RETURN: {
 595     // RETURN instructions must have the live-out registers as implicit uses,
 596     // otherwise they appear dead.
 597     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 598     MachineInstrBuilder MIB(*MF, MI);
 599     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 600       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 601     return BB;
 602   }
 603   }
 604
 605   MI.eraseFromParent();
 606   return BB;
 607 }
 608
 609 //===----------------------------------------------------------------------===//
 610 // Custom DAG Lowering Operations
 611 //===----------------------------------------------------------------------===//
 612
 613 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 614   MachineFunction &MF = DAG.getMachineFunction();
 615   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 616   switch (Op.getOpcode()) {
 617   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 618   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 619   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 620   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 621   case ISD::SRA_PARTS:
 622   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 623   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 624   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 625   case ISD::FCOS:
 626   case ISD::FSIN: return LowerTrig(Op, DAG);
 627   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 628   case ISD::STORE: return LowerSTORE(Op, DAG);
 629   case ISD::LOAD: {
 630     SDValue Result = LowerLOAD(Op, DAG);
 631     assert((!Result.getNode() ||
 632             Result.getNode()->getNumValues() == 2) &&
 633            "Load should return a value and a chain");
 634     return Result;
 635   }
 636
 637   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 638   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 639   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
 640   case ISD::INTRINSIC_VOID: {
 641     SDValue Chain = Op.getOperand(0);
 642     unsigned IntrinsicID =
 643                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 644     switch (IntrinsicID) {
 645     case AMDGPUIntrinsic::r600_store_swizzle: {
 646       SDLoc DL(Op);
 647       const SDValue Args[8] = {
 648         Chain,
 649         Op.getOperand(2), // Export Value
 650         Op.getOperand(3), // ArrayBase
 651         Op.getOperand(4), // Type
 652         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 653         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 654         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 655         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 656       };
 657       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 658     }
 659
 660     // default for switch(IntrinsicID)
 661     default: break;
 662     }
 663     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 664     break;
 665   }
 666   case ISD::INTRINSIC_WO_CHAIN: {
 667     unsigned IntrinsicID =
 668                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 669     EVT VT = Op.getValueType();
 670     SDLoc DL(Op);
 671     switch(IntrinsicID) {
 672     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 673     case AMDGPUIntrinsic::r600_tex:
 674     case AMDGPUIntrinsic::r600_texc:
 675     case AMDGPUIntrinsic::r600_txl:
 676     case AMDGPUIntrinsic::r600_txlc:
 677     case AMDGPUIntrinsic::r600_txb:
 678     case AMDGPUIntrinsic::r600_txbc:
 679     case AMDGPUIntrinsic::r600_txf:
 680     case AMDGPUIntrinsic::r600_txq:
 681     case AMDGPUIntrinsic::r600_ddx:
 682     case AMDGPUIntrinsic::r600_ddy: {
 683       unsigned TextureOp;
 684       switch (IntrinsicID) {
 685       case AMDGPUIntrinsic::r600_tex:
 686         TextureOp = 0;
 687         break;
 688       case AMDGPUIntrinsic::r600_texc:
 689         TextureOp = 1;
 690         break;
 691       case AMDGPUIntrinsic::r600_txl:
 692         TextureOp = 2;
 693         break;
 694       case AMDGPUIntrinsic::r600_txlc:
 695         TextureOp = 3;
 696         break;
 697       case AMDGPUIntrinsic::r600_txb:
 698         TextureOp = 4;
 699         break;
 700       case AMDGPUIntrinsic::r600_txbc:
 701         TextureOp = 5;
 702         break;
 703       case AMDGPUIntrinsic::r600_txf:
 704         TextureOp = 6;
 705         break;
 706       case AMDGPUIntrinsic::r600_txq:
 707         TextureOp = 7;
 708         break;
 709       case AMDGPUIntrinsic::r600_ddx:
 710         TextureOp = 8;
 711         break;
 712       case AMDGPUIntrinsic::r600_ddy:
 713         TextureOp = 9;
 714         break;
 715       default:
 716         llvm_unreachable("Unknow Texture Operation");
 717       }
 718
 719       SDValue TexArgs[19] = {
 720         DAG.getConstant(TextureOp, DL, MVT::i32),
 721         Op.getOperand(1),
 722         DAG.getConstant(0, DL, MVT::i32),
 723         DAG.getConstant(1, DL, MVT::i32),
 724         DAG.getConstant(2, DL, MVT::i32),
 725         DAG.getConstant(3, DL, MVT::i32),
 726         Op.getOperand(2),
 727         Op.getOperand(3),
 728         Op.getOperand(4),
 729         DAG.getConstant(0, DL, MVT::i32),
 730         DAG.getConstant(1, DL, MVT::i32),
 731         DAG.getConstant(2, DL, MVT::i32),
 732         DAG.getConstant(3, DL, MVT::i32),
 733         Op.getOperand(5),
 734         Op.getOperand(6),
 735         Op.getOperand(7),
 736         Op.getOperand(8),
 737         Op.getOperand(9),
 738         Op.getOperand(10)
 739       };
 740       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 741     }
 742     case AMDGPUIntrinsic::r600_dot4: {
 743       SDValue Args[8] = {
 744       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 745           DAG.getConstant(0, DL, MVT::i32)),
 746       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 747           DAG.getConstant(0, DL, MVT::i32)),
 748       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 749           DAG.getConstant(1, DL, MVT::i32)),
 750       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 751           DAG.getConstant(1, DL, MVT::i32)),
 752       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 753           DAG.getConstant(2, DL, MVT::i32)),
 754       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 755           DAG.getConstant(2, DL, MVT::i32)),
 756       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 757           DAG.getConstant(3, DL, MVT::i32)),
 758       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 759           DAG.getConstant(3, DL, MVT::i32))
 760       };
 761       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 762     }
 763
 764     case Intrinsic::r600_implicitarg_ptr: {
 765       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
 766       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
 767       return DAG.getConstant(ByteOffset, DL, PtrVT);
 768     }
 769     case Intrinsic::r600_read_ngroups_x:
 770       return LowerImplicitParameter(DAG, VT, DL, 0);
 771     case Intrinsic::r600_read_ngroups_y:
 772       return LowerImplicitParameter(DAG, VT, DL, 1);
 773     case Intrinsic::r600_read_ngroups_z:
 774       return LowerImplicitParameter(DAG, VT, DL, 2);
 775     case Intrinsic::r600_read_global_size_x:
 776       return LowerImplicitParameter(DAG, VT, DL, 3);
 777     case Intrinsic::r600_read_global_size_y:
 778       return LowerImplicitParameter(DAG, VT, DL, 4);
 779     case Intrinsic::r600_read_global_size_z:
 780       return LowerImplicitParameter(DAG, VT, DL, 5);
 781     case Intrinsic::r600_read_local_size_x:
 782       return LowerImplicitParameter(DAG, VT, DL, 6);
 783     case Intrinsic::r600_read_local_size_y:
 784       return LowerImplicitParameter(DAG, VT, DL, 7);
 785     case Intrinsic::r600_read_local_size_z:
 786       return LowerImplicitParameter(DAG, VT, DL, 8);
 787
 788     case Intrinsic::r600_read_workdim:
 789     case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
 790       uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
 791       return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
 792     }
 793
 794     case Intrinsic::r600_read_tgid_x:
 795       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 796                                   AMDGPU::T1_X, VT);
 797     case Intrinsic::r600_read_tgid_y:
 798       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 799                                   AMDGPU::T1_Y, VT);
 800     case Intrinsic::r600_read_tgid_z:
 801       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 802                                   AMDGPU::T1_Z, VT);
 803     case Intrinsic::r600_read_tidig_x:
 804       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 805                                   AMDGPU::T0_X, VT);
 806     case Intrinsic::r600_read_tidig_y:
 807       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 808                                   AMDGPU::T0_Y, VT);
 809     case Intrinsic::r600_read_tidig_z:
 810       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 811                                   AMDGPU::T0_Z, VT);
 812
 813     case Intrinsic::r600_recipsqrt_ieee:
 814       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
 815
 816     case Intrinsic::r600_recipsqrt_clamped:
 817       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
 818     }
 819
 820     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 821     break;
 822   }
 823   } // end switch(Op.getOpcode())
 824   return SDValue();
 825 }
 826
 827 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 828                                             SmallVectorImpl<SDValue> &Results,
 829                                             SelectionDAG &DAG) const {
 830   switch (N->getOpcode()) {
 831   default:
 832     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 833     return;
 834   case ISD::FP_TO_UINT:
 835     if (N->getValueType(0) == MVT::i1) {
 836       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
 837       return;
 838     }
 839     // Fall-through. Since we don't care about out of bounds values
 840     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 841     // considers some extra cases which are not necessary here.
 842   case ISD::FP_TO_SINT: {
 843     if (N->getValueType(0) == MVT::i1) {
 844       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
 845       return;
 846     }
 847
 848     SDValue Result;
 849     if (expandFP_TO_SINT(N, Result, DAG))
 850       Results.push_back(Result);
 851     return;
 852   }
 853   case ISD::SDIVREM: {
 854     SDValue Op = SDValue(N, 1);
 855     SDValue RES = LowerSDIVREM(Op, DAG);
 856     Results.push_back(RES);
 857     Results.push_back(RES.getValue(1));
 858     break;
 859   }
 860   case ISD::UDIVREM: {
 861     SDValue Op = SDValue(N, 0);
 862     LowerUDIVREM64(Op, DAG, Results);
 863     break;
 864   }
 865   }
 866 }
 867
 868 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 869                                                    SDValue Vector) const {
 870
 871   SDLoc DL(Vector);
 872   EVT VecVT = Vector.getValueType();
 873   EVT EltVT = VecVT.getVectorElementType();
 874   SmallVector<SDValue, 8> Args;
 875
 876   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 877                                                            i != e; ++i) {
 878     Args.push_back(DAG.getNode(
 879         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 880         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
 881   }
 882
 883   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 884 }
 885
 886 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 887                                                     SelectionDAG &DAG) const {
 888
 889   SDLoc DL(Op);
 890   SDValue Vector = Op.getOperand(0);
 891   SDValue Index = Op.getOperand(1);
 892
 893   if (isa<ConstantSDNode>(Index) ||
 894       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 895     return Op;
 896
 897   Vector = vectorToVerticalVector(DAG, Vector);
 898   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 899                      Vector, Index);
 900 }
 901
 902 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 903                                                    SelectionDAG &DAG) const {
 904   SDLoc DL(Op);
 905   SDValue Vector = Op.getOperand(0);
 906   SDValue Value = Op.getOperand(1);
 907   SDValue Index = Op.getOperand(2);
 908
 909   if (isa<ConstantSDNode>(Index) ||
 910       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 911     return Op;
 912
 913   Vector = vectorToVerticalVector(DAG, Vector);
 914   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 915                                Vector, Value, Index);
 916   return vectorToVerticalVector(DAG, Insert);
 917 }
 918
 919 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
 920                                                SDValue Op,
 921                                                SelectionDAG &DAG) const {
 922
 923   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
 924   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
 925     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 926
 927   const DataLayout &DL = DAG.getDataLayout();
 928   const GlobalValue *GV = GSD->getGlobal();
 929   MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
 930
 931   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
 932   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
 933 }
 934
 935 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 936   // On hw >= R700, COS/SIN input must be between -1. and 1.
 937   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 938   EVT VT = Op.getValueType();
 939   SDValue Arg = Op.getOperand(0);
 940   SDLoc DL(Op);
 941
 942   // TODO: Should this propagate fast-math-flags?
 943   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 944       DAG.getNode(ISD::FADD, DL, VT,
 945         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 946           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 947         DAG.getConstantFP(0.5, DL, MVT::f32)));
 948   unsigned TrigNode;
 949   switch (Op.getOpcode()) {
 950   case ISD::FCOS:
 951     TrigNode = AMDGPUISD::COS_HW;
 952     break;
 953   case ISD::FSIN:
 954     TrigNode = AMDGPUISD::SIN_HW;
 955     break;
 956   default:
 957     llvm_unreachable("Wrong trig opcode");
 958   }
 959   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
 960       DAG.getNode(ISD::FADD, DL, VT, FractPart,
 961         DAG.getConstantFP(-0.5, DL, MVT::f32)));
 962   if (Gen >= R600Subtarget::R700)
 963     return TrigVal;
 964   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 965   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
 966       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
 967 }
 968
 969 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
 970   SDLoc DL(Op);
 971   EVT VT = Op.getValueType();
 972
 973   SDValue Lo = Op.getOperand(0);
 974   SDValue Hi = Op.getOperand(1);
 975   SDValue Shift = Op.getOperand(2);
 976   SDValue Zero = DAG.getConstant(0, DL, VT);
 977   SDValue One  = DAG.getConstant(1, DL, VT);
 978
 979   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
 980   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
 981   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 982   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 983
 984   // The dance around Width1 is necessary for 0 special case.
 985   // Without it the CompShift might be 32, producing incorrect results in
 986   // Overflow. So we do the shift in two steps, the alternative is to
 987   // add a conditional to filter the special case.
 988
 989   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
 990   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
 991
 992   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
 993   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
 994   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
 995
 996   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
 997   SDValue LoBig = Zero;
 998
 999   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1000   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1001
1002   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1003 }
1004
1005 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1006   SDLoc DL(Op);
1007   EVT VT = Op.getValueType();
1008
1009   SDValue Lo = Op.getOperand(0);
1010   SDValue Hi = Op.getOperand(1);
1011   SDValue Shift = Op.getOperand(2);
1012   SDValue Zero = DAG.getConstant(0, DL, VT);
1013   SDValue One  = DAG.getConstant(1, DL, VT);
1014
1015   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1016
1017   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1018   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1019   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1020   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1021
1022   // The dance around Width1 is necessary for 0 special case.
1023   // Without it the CompShift might be 32, producing incorrect results in
1024   // Overflow. So we do the shift in two steps, the alternative is to
1025   // add a conditional to filter the special case.
1026
1027   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1028   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1029
1030   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1031   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1032   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1033
1034   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1035   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1036
1037   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1038   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1039
1040   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1041 }
1042
1043 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1044                                           unsigned mainop, unsigned ovf) const {
1045   SDLoc DL(Op);
1046   EVT VT = Op.getValueType();
1047
1048   SDValue Lo = Op.getOperand(0);
1049   SDValue Hi = Op.getOperand(1);
1050
1051   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1052   // Extend sign.
1053   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1054                     DAG.getValueType(MVT::i1));
1055
1056   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1057
1058   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1059 }
1060
1061 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
1062   SDLoc DL(Op);
1063   return DAG.getNode(
1064       ISD::SETCC,
1065       DL,
1066       MVT::i1,
1067       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
1068       DAG.getCondCode(ISD::SETEQ));
1069 }
1070
1071 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
1072   SDLoc DL(Op);
1073   return DAG.getNode(
1074       ISD::SETCC,
1075       DL,
1076       MVT::i1,
1077       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
1078       DAG.getCondCode(ISD::SETEQ));
1079 }
1080
1081 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1082                                                    const SDLoc &DL,
1083                                                    unsigned DwordOffset) const {
1084   unsigned ByteOffset = DwordOffset * 4;
1085   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1086                                       AMDGPUAS::CONSTANT_BUFFER_0);
1087
1088   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1089   assert(isInt<16>(ByteOffset));
1090
1091   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1092                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1093                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
1094 }
1095
1096 bool R600TargetLowering::isZero(SDValue Op) const {
1097   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1098     return Cst->isNullValue();
1099   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1100     return CstFP->isZero();
1101   } else {
1102     return false;
1103   }
1104 }
1105
1106 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1107   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1108     return CFP->isExactlyValue(1.0);
1109   }
1110   return isAllOnesConstant(Op);
1111 }
1112
1113 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1114   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1115     return CFP->getValueAPF().isZero();
1116   }
1117   return isNullConstant(Op);
1118 }
1119
1120 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1121   SDLoc DL(Op);
1122   EVT VT = Op.getValueType();
1123
1124   SDValue LHS = Op.getOperand(0);
1125   SDValue RHS = Op.getOperand(1);
1126   SDValue True = Op.getOperand(2);
1127   SDValue False = Op.getOperand(3);
1128   SDValue CC = Op.getOperand(4);
1129   SDValue Temp;
1130
1131   if (VT == MVT::f32) {
1132     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1133     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1134     if (MinMax)
1135       return MinMax;
1136   }
1137
1138   // LHS and RHS are guaranteed to be the same value type
1139   EVT CompareVT = LHS.getValueType();
1140
1141   // Check if we can lower this to a native operation.
1142
1143   // Try to lower to a SET* instruction:
1144   //
1145   // SET* can match the following patterns:
1146   //
1147   // select_cc f32, f32, -1,  0, cc_supported
1148   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1149   // select_cc i32, i32, -1,  0, cc_supported
1150   //
1151
1152   // Move hardware True/False values to the correct operand.
1153   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1154   ISD::CondCode InverseCC =
1155      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1156   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1157     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1158       std::swap(False, True);
1159       CC = DAG.getCondCode(InverseCC);
1160     } else {
1161       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1162       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1163         std::swap(False, True);
1164         std::swap(LHS, RHS);
1165         CC = DAG.getCondCode(SwapInvCC);
1166       }
1167     }
1168   }
1169
1170   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1171       (CompareVT == VT || VT == MVT::i32)) {
1172     // This can be matched by a SET* instruction.
1173     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1174   }
1175
1176   // Try to lower to a CND* instruction:
1177   //
1178   // CND* can match the following patterns:
1179   //
1180   // select_cc f32, 0.0, f32, f32, cc_supported
1181   // select_cc f32, 0.0, i32, i32, cc_supported
1182   // select_cc i32, 0,   f32, f32, cc_supported
1183   // select_cc i32, 0,   i32, i32, cc_supported
1184   //
1185
1186   // Try to move the zero value to the RHS
1187   if (isZero(LHS)) {
1188     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1189     // Try swapping the operands
1190     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1191     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1192       std::swap(LHS, RHS);
1193       CC = DAG.getCondCode(CCSwapped);
1194     } else {
1195       // Try inverting the conditon and then swapping the operands
1196       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1197       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1198       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1199         std::swap(True, False);
1200         std::swap(LHS, RHS);
1201         CC = DAG.getCondCode(CCSwapped);
1202       }
1203     }
1204   }
1205   if (isZero(RHS)) {
1206     SDValue Cond = LHS;
1207     SDValue Zero = RHS;
1208     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1209     if (CompareVT != VT) {
1210       // Bitcast True / False to the correct types.  This will end up being
1211       // a nop, but it allows us to define only a single pattern in the
1212       // .TD files for each CND* instruction rather than having to have
1213       // one pattern for integer True/False and one for fp True/False
1214       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1215       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1216     }
1217
1218     switch (CCOpcode) {
1219     case ISD::SETONE:
1220     case ISD::SETUNE:
1221     case ISD::SETNE:
1222       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1223       Temp = True;
1224       True = False;
1225       False = Temp;
1226       break;
1227     default:
1228       break;
1229     }
1230     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1231         Cond, Zero,
1232         True, False,
1233         DAG.getCondCode(CCOpcode));
1234     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1235   }
1236
1237   // If we make it this for it means we have no native instructions to handle
1238   // this SELECT_CC, so we must lower it.
1239   SDValue HWTrue, HWFalse;
1240
1241   if (CompareVT == MVT::f32) {
1242     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1243     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1244   } else if (CompareVT == MVT::i32) {
1245     HWTrue = DAG.getConstant(-1, DL, CompareVT);
1246     HWFalse = DAG.getConstant(0, DL, CompareVT);
1247   }
1248   else {
1249     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1250   }
1251
1252   // Lower this unsupported SELECT_CC into a combination of two supported
1253   // SELECT_CC operations.
1254   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1255
1256   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1257       Cond, HWFalse,
1258       True, False,
1259       DAG.getCondCode(ISD::SETNE));
1260 }
1261
1262 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1263 /// convert these pointers to a register index.  Each register holds
1264 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1265 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1266 /// for indirect addressing.
1267 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1268                                                unsigned StackWidth,
1269                                                SelectionDAG &DAG) const {
1270   unsigned SRLPad;
1271   switch(StackWidth) {
1272   case 1:
1273     SRLPad = 2;
1274     break;
1275   case 2:
1276     SRLPad = 3;
1277     break;
1278   case 4:
1279     SRLPad = 4;
1280     break;
1281   default: llvm_unreachable("Invalid stack width");
1282   }
1283
1284   SDLoc DL(Ptr);
1285   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1286                      DAG.getConstant(SRLPad, DL, MVT::i32));
1287 }
1288
1289 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1290                                          unsigned ElemIdx,
1291                                          unsigned &Channel,
1292                                          unsigned &PtrIncr) const {
1293   switch (StackWidth) {
1294   default:
1295   case 1:
1296     Channel = 0;
1297     if (ElemIdx > 0) {
1298       PtrIncr = 1;
1299     } else {
1300       PtrIncr = 0;
1301     }
1302     break;
1303   case 2:
1304     Channel = ElemIdx % 2;
1305     if (ElemIdx == 2) {
1306       PtrIncr = 1;
1307     } else {
1308       PtrIncr = 0;
1309     }
1310     break;
1311   case 4:
1312     Channel = ElemIdx;
1313     PtrIncr = 0;
1314     break;
1315   }
1316 }
1317
1318 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1319                                                    SelectionDAG &DAG) const {
1320   SDLoc DL(Store);
1321
1322   unsigned Mask = 0;
1323   if (Store->getMemoryVT() == MVT::i8) {
1324     Mask = 0xff;
1325   } else if (Store->getMemoryVT() == MVT::i16) {
1326     Mask = 0xffff;
1327   }
1328
1329   SDValue Chain = Store->getChain();
1330   SDValue BasePtr = Store->getBasePtr();
1331   EVT MemVT = Store->getMemoryVT();
1332
1333   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1334                             DAG.getConstant(2, DL, MVT::i32));
1335   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1336                             Chain, Ptr,
1337                             DAG.getTargetConstant(0, DL, MVT::i32));
1338
1339   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1340                                 DAG.getConstant(0x3, DL, MVT::i32));
1341
1342   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1343                                  DAG.getConstant(3, DL, MVT::i32));
1344
1345   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1346                                   Store->getValue());
1347
1348   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1349
1350   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1351                                      MaskedValue, ShiftAmt);
1352
1353   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1354                                 DAG.getConstant(Mask, DL, MVT::i32),
1355                                 ShiftAmt);
1356   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1357                         DAG.getConstant(0xffffffff, DL, MVT::i32));
1358   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1359
1360   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1361   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1362                      Chain, Value, Ptr,
1363                      DAG.getTargetConstant(0, DL, MVT::i32));
1364 }
1365
1366 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1367   if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1368     return Result;
1369
1370   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1371   unsigned AS = StoreNode->getAddressSpace();
1372   SDValue Value = StoreNode->getValue();
1373   EVT ValueVT = Value.getValueType();
1374
1375   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1376       ValueVT.isVector()) {
1377     return SplitVectorStore(Op, DAG);
1378   }
1379
1380   SDLoc DL(Op);
1381   SDValue Chain = StoreNode->getChain();
1382   SDValue Ptr = StoreNode->getBasePtr();
1383
1384   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1385     if (StoreNode->isTruncatingStore()) {
1386       EVT VT = Value.getValueType();
1387       assert(VT.bitsLE(MVT::i32));
1388       EVT MemVT = StoreNode->getMemoryVT();
1389       SDValue MaskConstant;
1390       if (MemVT == MVT::i8) {
1391         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1392       } else {
1393         assert(MemVT == MVT::i16);
1394         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1395       }
1396       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1397                                       DAG.getConstant(2, DL, MVT::i32));
1398       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1399                                       DAG.getConstant(0x00000003, DL, VT));
1400       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1401       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1402                                    DAG.getConstant(3, DL, VT));
1403       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1404       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1405       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1406       // vector instead.
1407       SDValue Src[4] = {
1408         ShiftedValue,
1409         DAG.getConstant(0, DL, MVT::i32),
1410         DAG.getConstant(0, DL, MVT::i32),
1411         Mask
1412       };
1413       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1414       SDValue Args[3] = { Chain, Input, DWordAddr };
1415       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1416                                      Op->getVTList(), Args, MemVT,
1417                                      StoreNode->getMemOperand());
1418     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1419                ValueVT.bitsGE(MVT::i32)) {
1420       // Convert pointer from byte address to dword address.
1421       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1422                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1423                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
1424
1425       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1426         llvm_unreachable("Truncated and indexed stores not supported yet");
1427       } else {
1428         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1429       }
1430       return Chain;
1431     }
1432   }
1433
1434   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1435     return SDValue();
1436
1437   EVT MemVT = StoreNode->getMemoryVT();
1438   if (MemVT.bitsLT(MVT::i32))
1439     return lowerPrivateTruncStore(StoreNode, DAG);
1440
1441   // Lowering for indirect addressing
1442   const MachineFunction &MF = DAG.getMachineFunction();
1443   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1444   unsigned StackWidth = TFL->getStackWidth(MF);
1445
1446   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1447
1448   if (ValueVT.isVector()) {
1449     unsigned NumElemVT = ValueVT.getVectorNumElements();
1450     EVT ElemVT = ValueVT.getVectorElementType();
1451     SmallVector<SDValue, 4> Stores(NumElemVT);
1452
1453     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1454                                       "vector width in load");
1455
1456     for (unsigned i = 0; i < NumElemVT; ++i) {
1457       unsigned Channel, PtrIncr;
1458       getStackAddress(StackWidth, i, Channel, PtrIncr);
1459       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1460                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1461       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1462                                  Value, DAG.getConstant(i, DL, MVT::i32));
1463
1464       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1465                               Chain, Elem, Ptr,
1466                               DAG.getTargetConstant(Channel, DL, MVT::i32));
1467     }
1468      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1469    } else {
1470     if (ValueVT == MVT::i8) {
1471       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1472     }
1473     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1474     DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1475   }
1476
1477   return Chain;
1478 }
1479
1480 // return (512 + (kc_bank << 12)
1481 static int
1482 ConstantAddressBlock(unsigned AddressSpace) {
1483   switch (AddressSpace) {
1484   case AMDGPUAS::CONSTANT_BUFFER_0:
1485     return 512;
1486   case AMDGPUAS::CONSTANT_BUFFER_1:
1487     return 512 + 4096;
1488   case AMDGPUAS::CONSTANT_BUFFER_2:
1489     return 512 + 4096 * 2;
1490   case AMDGPUAS::CONSTANT_BUFFER_3:
1491     return 512 + 4096 * 3;
1492   case AMDGPUAS::CONSTANT_BUFFER_4:
1493     return 512 + 4096 * 4;
1494   case AMDGPUAS::CONSTANT_BUFFER_5:
1495     return 512 + 4096 * 5;
1496   case AMDGPUAS::CONSTANT_BUFFER_6:
1497     return 512 + 4096 * 6;
1498   case AMDGPUAS::CONSTANT_BUFFER_7:
1499     return 512 + 4096 * 7;
1500   case AMDGPUAS::CONSTANT_BUFFER_8:
1501     return 512 + 4096 * 8;
1502   case AMDGPUAS::CONSTANT_BUFFER_9:
1503     return 512 + 4096 * 9;
1504   case AMDGPUAS::CONSTANT_BUFFER_10:
1505     return 512 + 4096 * 10;
1506   case AMDGPUAS::CONSTANT_BUFFER_11:
1507     return 512 + 4096 * 11;
1508   case AMDGPUAS::CONSTANT_BUFFER_12:
1509     return 512 + 4096 * 12;
1510   case AMDGPUAS::CONSTANT_BUFFER_13:
1511     return 512 + 4096 * 13;
1512   case AMDGPUAS::CONSTANT_BUFFER_14:
1513     return 512 + 4096 * 14;
1514   case AMDGPUAS::CONSTANT_BUFFER_15:
1515     return 512 + 4096 * 15;
1516   default:
1517     return -1;
1518   }
1519 }
1520
1521 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1522                                                 SelectionDAG &DAG) const {
1523   SDLoc DL(Op);
1524   LoadSDNode *Load = cast<LoadSDNode>(Op);
1525   ISD::LoadExtType ExtType = Load->getExtensionType();
1526   EVT MemVT = Load->getMemoryVT();
1527
1528   // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
1529   // register (2-)byte extract.
1530
1531   // Get Register holding the target.
1532   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1533                             DAG.getConstant(2, DL, MVT::i32));
1534   // Load the Register.
1535   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1536                             Load->getChain(),
1537                             Ptr,
1538                             DAG.getTargetConstant(0, DL, MVT::i32),
1539                             Op.getOperand(2));
1540
1541   // Get offset within the register.
1542   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1543                                 Load->getBasePtr(),
1544                                 DAG.getConstant(0x3, DL, MVT::i32));
1545
1546   // Bit offset of target byte (byteIdx * 8).
1547   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1548                                  DAG.getConstant(3, DL, MVT::i32));
1549
1550   // Shift to the right.
1551   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1552
1553   // Eliminate the upper bits by setting them to ...
1554   EVT MemEltVT = MemVT.getScalarType();
1555
1556   // ... ones.
1557   if (ExtType == ISD::SEXTLOAD) {
1558     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1559
1560     SDValue Ops[] = {
1561       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1562       Load->getChain()
1563     };
1564
1565     return DAG.getMergeValues(Ops, DL);
1566   }
1567
1568   // ... or zeros.
1569   SDValue Ops[] = {
1570     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1571     Load->getChain()
1572   };
1573
1574   return DAG.getMergeValues(Ops, DL);
1575 }
1576
1577 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1578   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1579   unsigned AS = LoadNode->getAddressSpace();
1580   EVT MemVT = LoadNode->getMemoryVT();
1581   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1582
1583   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1584       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1585     return lowerPrivateExtLoad(Op, DAG);
1586   }
1587
1588   SDLoc DL(Op);
1589   EVT VT = Op.getValueType();
1590   SDValue Chain = LoadNode->getChain();
1591   SDValue Ptr = LoadNode->getBasePtr();
1592
1593   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1594     SDValue MergedValues[2] = {
1595       scalarizeVectorLoad(LoadNode, DAG),
1596       Chain
1597     };
1598     return DAG.getMergeValues(MergedValues, DL);
1599   }
1600
1601   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1602   if (ConstantBlock > -1 &&
1603       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1604        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1605     SDValue Result;
1606     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1607         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1608         isa<ConstantSDNode>(Ptr)) {
1609       SDValue Slots[4];
1610       for (unsigned i = 0; i < 4; i++) {
1611         // We want Const position encoded with the following formula :
1612         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1613         // const_index is Ptr computed by llvm using an alignment of 16.
1614         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1615         // then div by 4 at the ISel step
1616         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1617             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1618         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1619       }
1620       EVT NewVT = MVT::v4i32;
1621       unsigned NumElements = 4;
1622       if (VT.isVector()) {
1623         NewVT = VT;
1624         NumElements = VT.getVectorNumElements();
1625       }
1626       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1627     } else {
1628       // non-constant ptr can't be folded, keeps it as a v4f32 load
1629       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1630           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1631                       DAG.getConstant(4, DL, MVT::i32)),
1632                       DAG.getConstant(LoadNode->getAddressSpace() -
1633                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1634           );
1635     }
1636
1637     if (!VT.isVector()) {
1638       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1639                            DAG.getConstant(0, DL, MVT::i32));
1640     }
1641
1642     SDValue MergedValues[2] = {
1643       Result,
1644       Chain
1645     };
1646     return DAG.getMergeValues(MergedValues, DL);
1647   }
1648
1649   SDValue LoweredLoad;
1650
1651   // For most operations returning SDValue() will result in the node being
1652   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1653   // need to manually expand loads that may be legal in some address spaces and
1654   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1655   // compute shaders, since the data is sign extended when it is uploaded to the
1656   // buffer. However SEXT loads from other address spaces are not supported, so
1657   // we need to expand them here.
1658   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1659     EVT MemVT = LoadNode->getMemoryVT();
1660     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1661     SDValue NewLoad = DAG.getExtLoad(
1662         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
1663         LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
1664     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1665                               DAG.getValueType(MemVT));
1666
1667     SDValue MergedValues[2] = { Res, Chain };
1668     return DAG.getMergeValues(MergedValues, DL);
1669   }
1670
1671   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1672     return SDValue();
1673   }
1674
1675   // Lowering for indirect addressing
1676   const MachineFunction &MF = DAG.getMachineFunction();
1677   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1678   unsigned StackWidth = TFL->getStackWidth(MF);
1679
1680   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1681
1682   if (VT.isVector()) {
1683     unsigned NumElemVT = VT.getVectorNumElements();
1684     EVT ElemVT = VT.getVectorElementType();
1685     SDValue Loads[4];
1686
1687     assert(NumElemVT <= 4);
1688     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1689                                       "vector width in load");
1690
1691     for (unsigned i = 0; i < NumElemVT; ++i) {
1692       unsigned Channel, PtrIncr;
1693       getStackAddress(StackWidth, i, Channel, PtrIncr);
1694       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1695                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1696       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1697                              Chain, Ptr,
1698                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1699                              Op.getOperand(2));
1700     }
1701     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1702     LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1703   } else {
1704     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1705                               Chain, Ptr,
1706                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1707                               Op.getOperand(2));
1708   }
1709
1710   SDValue Ops[2] = {
1711     LoweredLoad,
1712     Chain
1713   };
1714
1715   return DAG.getMergeValues(Ops, DL);
1716 }
1717
1718 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1719   SDValue Chain = Op.getOperand(0);
1720   SDValue Cond  = Op.getOperand(1);
1721   SDValue Jump  = Op.getOperand(2);
1722
1723   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1724                      Chain, Jump, Cond);
1725 }
1726
1727 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1728                                             SelectionDAG &DAG) const {
1729   MachineFunction &MF = DAG.getMachineFunction();
1730   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1731
1732   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1733
1734   unsigned FrameIndex = FIN->getIndex();
1735   unsigned IgnoredFrameReg;
1736   unsigned Offset =
1737     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1738   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1739                          Op.getValueType());
1740 }
1741
1742 /// XXX Only kernel functions are supported, so we can assume for now that
1743 /// every function is a kernel function, but in the future we should use
1744 /// separate calling conventions for kernel and non-kernel functions.
1745 SDValue R600TargetLowering::LowerFormalArguments(
1746     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1747     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1748     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1749   SmallVector<CCValAssign, 16> ArgLocs;
1750   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1751                  *DAG.getContext());
1752   MachineFunction &MF = DAG.getMachineFunction();
1753   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1754
1755   SmallVector<ISD::InputArg, 8> LocalIns;
1756
1757   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1758
1759   AnalyzeFormalArguments(CCInfo, LocalIns);
1760
1761   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1762     CCValAssign &VA = ArgLocs[i];
1763     const ISD::InputArg &In = Ins[i];
1764     EVT VT = In.VT;
1765     EVT MemVT = VA.getLocVT();
1766     if (!VT.isVector() && MemVT.isVector()) {
1767       // Get load source type if scalarized.
1768       MemVT = MemVT.getVectorElementType();
1769     }
1770
1771     if (AMDGPU::isShader(CallConv)) {
1772       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1773       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1774       InVals.push_back(Register);
1775       continue;
1776     }
1777
1778     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1779                                           AMDGPUAS::CONSTANT_BUFFER_0);
1780
1781     // i64 isn't a legal type, so the register type used ends up as i32, which
1782     // isn't expected here. It attempts to create this sextload, but it ends up
1783     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1784     // for <1 x i64>.
1785
1786     // The first 36 bytes of the input buffer contains information about
1787     // thread group and global sizes.
1788     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1789     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1790       // FIXME: This should really check the extload type, but the handling of
1791       // extload vector parameters seems to be broken.
1792
1793       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1794       Ext = ISD::SEXTLOAD;
1795     }
1796
1797     // Compute the offset from the value.
1798     // XXX - I think PartOffset should give you this, but it seems to give the
1799     // size of the register which isn't useful.
1800
1801     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1802     unsigned PartOffset = VA.getLocMemOffset();
1803     unsigned Offset = 36 + VA.getLocMemOffset();
1804
1805     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1806     SDValue Arg = DAG.getLoad(
1807         ISD::UNINDEXED, Ext, VT, DL, Chain,
1808         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
1809         MemVT, /* Alignment = */ 4,
1810         MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
1811
1812     // 4 is the preferred alignment for the CONSTANT memory space.
1813     InVals.push_back(Arg);
1814     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1815   }
1816   return Chain;
1817 }
1818
1819 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1820                                            EVT VT) const {
1821    if (!VT.isVector())
1822      return MVT::i32;
1823    return VT.changeVectorElementTypeToInteger();
1824 }
1825
1826 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1827                                                         unsigned AddrSpace,
1828                                                         unsigned Align,
1829                                                         bool *IsFast) const {
1830   if (IsFast)
1831     *IsFast = false;
1832
1833   if (!VT.isSimple() || VT == MVT::Other)
1834     return false;
1835
1836   if (VT.bitsLT(MVT::i32))
1837     return false;
1838
1839   // TODO: This is a rough estimate.
1840   if (IsFast)
1841     *IsFast = true;
1842
1843   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1844 }
1845
1846 static SDValue CompactSwizzlableVector(
1847   SelectionDAG &DAG, SDValue VectorEntry,
1848   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1849   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1850   assert(RemapSwizzle.empty());
1851   SDValue NewBldVec[4] = {
1852     VectorEntry.getOperand(0),
1853     VectorEntry.getOperand(1),
1854     VectorEntry.getOperand(2),
1855     VectorEntry.getOperand(3)
1856   };
1857
1858   for (unsigned i = 0; i < 4; i++) {
1859     if (NewBldVec[i].isUndef())
1860       // We mask write here to teach later passes that the ith element of this
1861       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1862       // break false dependencies and additionnaly make assembly easier to read.
1863       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1864     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1865       if (C->isZero()) {
1866         RemapSwizzle[i] = 4; // SEL_0
1867         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1868       } else if (C->isExactlyValue(1.0)) {
1869         RemapSwizzle[i] = 5; // SEL_1
1870         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1871       }
1872     }
1873
1874     if (NewBldVec[i].isUndef())
1875       continue;
1876     for (unsigned j = 0; j < i; j++) {
1877       if (NewBldVec[i] == NewBldVec[j]) {
1878         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1879         RemapSwizzle[i] = j;
1880         break;
1881       }
1882     }
1883   }
1884
1885   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1886                             NewBldVec);
1887 }
1888
1889 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1890                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1891   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1892   assert(RemapSwizzle.empty());
1893   SDValue NewBldVec[4] = {
1894       VectorEntry.getOperand(0),
1895       VectorEntry.getOperand(1),
1896       VectorEntry.getOperand(2),
1897       VectorEntry.getOperand(3)
1898   };
1899   bool isUnmovable[4] = { false, false, false, false };
1900   for (unsigned i = 0; i < 4; i++) {
1901     RemapSwizzle[i] = i;
1902     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1903       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1904           ->getZExtValue();
1905       if (i == Idx)
1906         isUnmovable[Idx] = true;
1907     }
1908   }
1909
1910   for (unsigned i = 0; i < 4; i++) {
1911     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1912       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1913           ->getZExtValue();
1914       if (isUnmovable[Idx])
1915         continue;
1916       // Swap i and Idx
1917       std::swap(NewBldVec[Idx], NewBldVec[i]);
1918       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1919       break;
1920     }
1921   }
1922
1923   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1924                             NewBldVec);
1925 }
1926
1927 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1928                                             SelectionDAG &DAG,
1929                                             const SDLoc &DL) const {
1930   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1931   // Old -> New swizzle values
1932   DenseMap<unsigned, unsigned> SwizzleRemap;
1933
1934   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1935   for (unsigned i = 0; i < 4; i++) {
1936     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1937     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1938       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1939   }
1940
1941   SwizzleRemap.clear();
1942   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1943   for (unsigned i = 0; i < 4; i++) {
1944     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1945     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1946       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1947   }
1948
1949   return BuildVector;
1950 }
1951
1952
1953 //===----------------------------------------------------------------------===//
1954 // Custom DAG Optimizations
1955 //===----------------------------------------------------------------------===//
1956
1957 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1958                                               DAGCombinerInfo &DCI) const {
1959   SelectionDAG &DAG = DCI.DAG;
1960
1961   switch (N->getOpcode()) {
1962   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1963   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1964   case ISD::FP_ROUND: {
1965       SDValue Arg = N->getOperand(0);
1966       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1967         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1968                            Arg.getOperand(0));
1969       }
1970       break;
1971     }
1972
1973   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1974   // (i32 select_cc f32, f32, -1, 0 cc)
1975   //
1976   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1977   // this to one of the SET*_DX10 instructions.
1978   case ISD::FP_TO_SINT: {
1979     SDValue FNeg = N->getOperand(0);
1980     if (FNeg.getOpcode() != ISD::FNEG) {
1981       return SDValue();
1982     }
1983     SDValue SelectCC = FNeg.getOperand(0);
1984     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1985         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1986         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1987         !isHWTrueValue(SelectCC.getOperand(2)) ||
1988         !isHWFalseValue(SelectCC.getOperand(3))) {
1989       return SDValue();
1990     }
1991
1992     SDLoc dl(N);
1993     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1994                            SelectCC.getOperand(0), // LHS
1995                            SelectCC.getOperand(1), // RHS
1996                            DAG.getConstant(-1, dl, MVT::i32), // True
1997                            DAG.getConstant(0, dl, MVT::i32),  // False
1998                            SelectCC.getOperand(4)); // CC
1999
2000     break;
2001   }
2002
2003   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
2004   // => build_vector elt0, ... , NewEltIdx, ... , eltN
2005   case ISD::INSERT_VECTOR_ELT: {
2006     SDValue InVec = N->getOperand(0);
2007     SDValue InVal = N->getOperand(1);
2008     SDValue EltNo = N->getOperand(2);
2009     SDLoc dl(N);
2010
2011     // If the inserted element is an UNDEF, just use the input vector.
2012     if (InVal.isUndef())
2013       return InVec;
2014
2015     EVT VT = InVec.getValueType();
2016
2017     // If we can't generate a legal BUILD_VECTOR, exit
2018     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2019       return SDValue();
2020
2021     // Check that we know which element is being inserted
2022     if (!isa<ConstantSDNode>(EltNo))
2023       return SDValue();
2024     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2025
2026     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2027     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
2028     // vector elements.
2029     SmallVector<SDValue, 8> Ops;
2030     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2031       Ops.append(InVec.getNode()->op_begin(),
2032                  InVec.getNode()->op_end());
2033     } else if (InVec.isUndef()) {
2034       unsigned NElts = VT.getVectorNumElements();
2035       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2036     } else {
2037       return SDValue();
2038     }
2039
2040     // Insert the element
2041     if (Elt < Ops.size()) {
2042       // All the operands of BUILD_VECTOR must have the same type;
2043       // we enforce that here.
2044       EVT OpVT = Ops[0].getValueType();
2045       if (InVal.getValueType() != OpVT)
2046         InVal = OpVT.bitsGT(InVal.getValueType()) ?
2047           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2048           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2049       Ops[Elt] = InVal;
2050     }
2051
2052     // Return the new vector
2053     return DAG.getBuildVector(VT, dl, Ops);
2054   }
2055
2056   // Extract_vec (Build_vector) generated by custom lowering
2057   // also needs to be customly combined
2058   case ISD::EXTRACT_VECTOR_ELT: {
2059     SDValue Arg = N->getOperand(0);
2060     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2061       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2062         unsigned Element = Const->getZExtValue();
2063         return Arg->getOperand(Element);
2064       }
2065     }
2066     if (Arg.getOpcode() == ISD::BITCAST &&
2067         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2068       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2069         unsigned Element = Const->getZExtValue();
2070         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2071             Arg->getOperand(0).getOperand(Element));
2072       }
2073     }
2074     break;
2075   }
2076
2077   case ISD::SELECT_CC: {
2078     // Try common optimizations
2079     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2080       return Ret;
2081
2082     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2083     //      selectcc x, y, a, b, inv(cc)
2084     //
2085     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2086     //      selectcc x, y, a, b, cc
2087     SDValue LHS = N->getOperand(0);
2088     if (LHS.getOpcode() != ISD::SELECT_CC) {
2089       return SDValue();
2090     }
2091
2092     SDValue RHS = N->getOperand(1);
2093     SDValue True = N->getOperand(2);
2094     SDValue False = N->getOperand(3);
2095     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2096
2097     if (LHS.getOperand(2).getNode() != True.getNode() ||
2098         LHS.getOperand(3).getNode() != False.getNode() ||
2099         RHS.getNode() != False.getNode()) {
2100       return SDValue();
2101     }
2102
2103     switch (NCC) {
2104     default: return SDValue();
2105     case ISD::SETNE: return LHS;
2106     case ISD::SETEQ: {
2107       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2108       LHSCC = ISD::getSetCCInverse(LHSCC,
2109                                   LHS.getOperand(0).getValueType().isInteger());
2110       if (DCI.isBeforeLegalizeOps() ||
2111           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2112         return DAG.getSelectCC(SDLoc(N),
2113                                LHS.getOperand(0),
2114                                LHS.getOperand(1),
2115                                LHS.getOperand(2),
2116                                LHS.getOperand(3),
2117                                LHSCC);
2118       break;
2119     }
2120     }
2121     return SDValue();
2122   }
2123
2124   case AMDGPUISD::EXPORT: {
2125     SDValue Arg = N->getOperand(1);
2126     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2127       break;
2128
2129     SDValue NewArgs[8] = {
2130       N->getOperand(0), // Chain
2131       SDValue(),
2132       N->getOperand(2), // ArrayBase
2133       N->getOperand(3), // Type
2134       N->getOperand(4), // SWZ_X
2135       N->getOperand(5), // SWZ_Y
2136       N->getOperand(6), // SWZ_Z
2137       N->getOperand(7) // SWZ_W
2138     };
2139     SDLoc DL(N);
2140     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2141     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2142   }
2143   case AMDGPUISD::TEXTURE_FETCH: {
2144     SDValue Arg = N->getOperand(1);
2145     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2146       break;
2147
2148     SDValue NewArgs[19] = {
2149       N->getOperand(0),
2150       N->getOperand(1),
2151       N->getOperand(2),
2152       N->getOperand(3),
2153       N->getOperand(4),
2154       N->getOperand(5),
2155       N->getOperand(6),
2156       N->getOperand(7),
2157       N->getOperand(8),
2158       N->getOperand(9),
2159       N->getOperand(10),
2160       N->getOperand(11),
2161       N->getOperand(12),
2162       N->getOperand(13),
2163       N->getOperand(14),
2164       N->getOperand(15),
2165       N->getOperand(16),
2166       N->getOperand(17),
2167       N->getOperand(18),
2168     };
2169     SDLoc DL(N);
2170     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2171     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2172   }
2173   }
2174
2175   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2176 }
2177
2178 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2179                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
2180                                      SDValue &Sel, SDValue &Imm,
2181                                      SelectionDAG &DAG) const {
2182   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2183   if (!Src.isMachineOpcode())
2184     return false;
2185
2186   switch (Src.getMachineOpcode()) {
2187   case AMDGPU::FNEG_R600:
2188     if (!Neg.getNode())
2189       return false;
2190     Src = Src.getOperand(0);
2191     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2192     return true;
2193   case AMDGPU::FABS_R600:
2194     if (!Abs.getNode())
2195       return false;
2196     Src = Src.getOperand(0);
2197     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2198     return true;
2199   case AMDGPU::CONST_COPY: {
2200     unsigned Opcode = ParentNode->getMachineOpcode();
2201     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2202
2203     if (!Sel.getNode())
2204       return false;
2205
2206     SDValue CstOffset = Src.getOperand(0);
2207     if (ParentNode->getValueType(0).isVector())
2208       return false;
2209
2210     // Gather constants values
2211     int SrcIndices[] = {
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2223     };
2224     std::vector<unsigned> Consts;
2225     for (int OtherSrcIdx : SrcIndices) {
2226       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2227       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2228         continue;
2229       if (HasDst) {
2230         OtherSrcIdx--;
2231         OtherSelIdx--;
2232       }
2233       if (RegisterSDNode *Reg =
2234           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2235         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2236           ConstantSDNode *Cst
2237             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2238           Consts.push_back(Cst->getZExtValue());
2239         }
2240       }
2241     }
2242
2243     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2244     Consts.push_back(Cst->getZExtValue());
2245     if (!TII->fitsConstReadLimitations(Consts)) {
2246       return false;
2247     }
2248
2249     Sel = CstOffset;
2250     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2251     return true;
2252   }
2253   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2254     // Check if the Imm slot is used. Taken from below.
2255     if (cast<ConstantSDNode>(Imm)->getZExtValue())
2256       return false;
2257     Imm = Src.getOperand(0);
2258     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2259     return true;
2260   case AMDGPU::MOV_IMM_I32:
2261   case AMDGPU::MOV_IMM_F32: {
2262     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2263     uint64_t ImmValue = 0;
2264
2265
2266     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2267       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2268       float FloatValue = FPC->getValueAPF().convertToFloat();
2269       if (FloatValue == 0.0) {
2270         ImmReg = AMDGPU::ZERO;
2271       } else if (FloatValue == 0.5) {
2272         ImmReg = AMDGPU::HALF;
2273       } else if (FloatValue == 1.0) {
2274         ImmReg = AMDGPU::ONE;
2275       } else {
2276         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2277       }
2278     } else {
2279       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2280       uint64_t Value = C->getZExtValue();
2281       if (Value == 0) {
2282         ImmReg = AMDGPU::ZERO;
2283       } else if (Value == 1) {
2284         ImmReg = AMDGPU::ONE_INT;
2285       } else {
2286         ImmValue = Value;
2287       }
2288     }
2289
2290     // Check that we aren't already using an immediate.
2291     // XXX: It's possible for an instruction to have more than one
2292     // immediate operand, but this is not supported yet.
2293     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2294       if (!Imm.getNode())
2295         return false;
2296       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2297       assert(C);
2298       if (C->getZExtValue())
2299         return false;
2300       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2301     }
2302     Src = DAG.getRegister(ImmReg, MVT::i32);
2303     return true;
2304   }
2305   default:
2306     return false;
2307   }
2308 }
2309
2310 /// \brief Fold the instructions after selecting them
2311 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2312                                             SelectionDAG &DAG) const {
2313   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2314   if (!Node->isMachineOpcode())
2315     return Node;
2316
2317   unsigned Opcode = Node->getMachineOpcode();
2318   SDValue FakeOp;
2319
2320   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2321
2322   if (Opcode == AMDGPU::DOT_4) {
2323     int OperandIdx[] = {
2324       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2325       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2326       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2327       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2328       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2329       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2330       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2331       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2332         };
2333     int NegIdx[] = {
2334       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2335       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2336       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2337       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2338       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2339       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2340       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2341       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2342     };
2343     int AbsIdx[] = {
2344       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2345       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2346       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2347       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2348       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2349       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2350       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2351       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2352     };
2353     for (unsigned i = 0; i < 8; i++) {
2354       if (OperandIdx[i] < 0)
2355         return Node;
2356       SDValue &Src = Ops[OperandIdx[i] - 1];
2357       SDValue &Neg = Ops[NegIdx[i] - 1];
2358       SDValue &Abs = Ops[AbsIdx[i] - 1];
2359       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2360       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2361       if (HasDst)
2362         SelIdx--;
2363       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2364       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2365         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2366     }
2367   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2368     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2369       SDValue &Src = Ops[i];
2370       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2371         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2372     }
2373   } else if (Opcode == AMDGPU::CLAMP_R600) {
2374     SDValue Src = Node->getOperand(0);
2375     if (!Src.isMachineOpcode() ||
2376         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2377       return Node;
2378     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2379         AMDGPU::OpName::clamp);
2380     if (ClampIdx < 0)
2381       return Node;
2382     SDLoc DL(Node);
2383     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2384     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2385     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2386                               Node->getVTList(), Ops);
2387   } else {
2388     if (!TII->hasInstrModifiers(Opcode))
2389       return Node;
2390     int OperandIdx[] = {
2391       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2392       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2393       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2394     };
2395     int NegIdx[] = {
2396       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2397       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2398       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2399     };
2400     int AbsIdx[] = {
2401       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2402       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2403       -1
2404     };
2405     for (unsigned i = 0; i < 3; i++) {
2406       if (OperandIdx[i] < 0)
2407         return Node;
2408       SDValue &Src = Ops[OperandIdx[i] - 1];
2409       SDValue &Neg = Ops[NegIdx[i] - 1];
2410       SDValue FakeAbs;
2411       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2412       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2413       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2414       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2415       if (HasDst) {
2416         SelIdx--;
2417         ImmIdx--;
2418       }
2419       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2420       SDValue &Imm = Ops[ImmIdx];
2421       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2422         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2423     }
2424   }
2425
2426   return Node;
2427 }