contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

   1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief This is the parent TargetLowering class for hardware code gen
  12 /// targets.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPUISelLowering.h"
  17 #include "AMDGPU.h"
  18 #include "AMDGPUCallLowering.h"
  19 #include "AMDGPUFrameLowering.h"
  20 #include "AMDGPUIntrinsicInfo.h"
  21 #include "AMDGPURegisterInfo.h"
  22 #include "AMDGPUSubtarget.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "SIInstrInfo.h"
  25 #include "SIMachineFunctionInfo.h"
  26 #include "llvm/CodeGen/CallingConvLower.h"
  27 #include "llvm/CodeGen/MachineFunction.h"
  28 #include "llvm/CodeGen/MachineRegisterInfo.h"
  29 #include "llvm/CodeGen/SelectionDAG.h"
  30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  31 #include "llvm/IR/DataLayout.h"
  32 #include "llvm/IR/DiagnosticInfo.h"
  33 #include "llvm/Support/KnownBits.h"
  34 using namespace llvm;
  35
  36 static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
  37                             CCValAssign::LocInfo LocInfo,
  38                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
  39   MachineFunction &MF = State.getMachineFunction();
  40   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  41
  42   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
  43                                          ArgFlags.getOrigAlign());
  44   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  45   return true;
  46 }
  47
  48 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
  49                            CCValAssign::LocInfo LocInfo,
  50                            ISD::ArgFlagsTy ArgFlags, CCState &State,
  51                            const TargetRegisterClass *RC,
  52                            unsigned NumRegs) {
  53   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  54   unsigned RegResult = State.AllocateReg(RegList);
  55   if (RegResult == AMDGPU::NoRegister)
  56     return false;
  57
  58   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
  59   return true;
  60 }
  61
  62 static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
  63                               CCValAssign::LocInfo LocInfo,
  64                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
  65   switch (LocVT.SimpleTy) {
  66   case MVT::i64:
  67   case MVT::f64:
  68   case MVT::v2i32:
  69   case MVT::v2f32: {
  70     // Up to SGPR0-SGPR39
  71     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  72                           &AMDGPU::SGPR_64RegClass, 20);
  73   }
  74   default:
  75     return false;
  76   }
  77 }
  78
  79 // Allocate up to VGPR31.
  80 //
  81 // TODO: Since there are no VGPR alignent requirements would it be better to
  82 // split into individual scalar registers?
  83 static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
  84                               CCValAssign::LocInfo LocInfo,
  85                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
  86   switch (LocVT.SimpleTy) {
  87   case MVT::i64:
  88   case MVT::f64:
  89   case MVT::v2i32:
  90   case MVT::v2f32: {
  91     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  92                           &AMDGPU::VReg_64RegClass, 31);
  93   }
  94   case MVT::v4i32:
  95   case MVT::v4f32:
  96   case MVT::v2i64:
  97   case MVT::v2f64: {
  98     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  99                           &AMDGPU::VReg_128RegClass, 29);
 100   }
 101   case MVT::v8i32:
 102   case MVT::v8f32: {
 103     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
 104                           &AMDGPU::VReg_256RegClass, 25);
 105
 106   }
 107   case MVT::v16i32:
 108   case MVT::v16f32: {
 109     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
 110                           &AMDGPU::VReg_512RegClass, 17);
 111
 112   }
 113   default:
 114     return false;
 115   }
 116 }
 117
 118 #include "AMDGPUGenCallingConv.inc"
 119
 120 // Find a larger type to do a load / store of a vector with.
 121 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
 122   unsigned StoreSize = VT.getStoreSizeInBits();
 123   if (StoreSize <= 32)
 124     return EVT::getIntegerVT(Ctx, StoreSize);
 125
 126   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
 127   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 128 }
 129
 130 bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
 131 {
 132   assert(Op.getOpcode() == ISD::OR);
 133
 134   SDValue N0 = Op->getOperand(0);
 135   SDValue N1 = Op->getOperand(1);
 136   EVT VT = N0.getValueType();
 137
 138   if (VT.isInteger() && !VT.isVector()) {
 139     KnownBits LHSKnown, RHSKnown;
 140     DAG.computeKnownBits(N0, LHSKnown);
 141
 142     if (LHSKnown.Zero.getBoolValue()) {
 143       DAG.computeKnownBits(N1, RHSKnown);
 144
 145       if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
 146         return true;
 147     }
 148   }
 149
 150   return false;
 151 }
 152
 153 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 154                                            const AMDGPUSubtarget &STI)
 155     : TargetLowering(TM), Subtarget(&STI) {
 156   AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
 157   // Lower floating point store/load to integer store/load to reduce the number
 158   // of patterns in tablegen.
 159   setOperationAction(ISD::LOAD, MVT::f32, Promote);
 160   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 161
 162   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
 163   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
 164
 165   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
 166   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 167
 168   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
 169   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
 170
 171   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
 172   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
 173
 174   setOperationAction(ISD::LOAD, MVT::i64, Promote);
 175   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
 176
 177   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
 178   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
 179
 180   setOperationAction(ISD::LOAD, MVT::f64, Promote);
 181   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
 182
 183   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
 184   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
 185
 186   // There are no 64-bit extloads. These should be done as a 32-bit extload and
 187   // an extension to 64-bit.
 188   for (MVT VT : MVT::integer_valuetypes()) {
 189     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
 190     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
 191     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
 192   }
 193
 194   for (MVT VT : MVT::integer_valuetypes()) {
 195     if (VT == MVT::i64)
 196       continue;
 197
 198     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 199     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
 200     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
 201     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
 202
 203     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 204     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
 205     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
 206     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
 207
 208     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 209     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
 210     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
 211     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
 212   }
 213
 214   for (MVT VT : MVT::integer_vector_valuetypes()) {
 215     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
 216     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
 217     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
 218     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
 219     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
 220     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
 221     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
 222     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
 223     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
 224     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
 225     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
 226     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
 227   }
 228
 229   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 230   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
 231   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
 232   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
 233
 234   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 235   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
 236   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
 237   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
 238
 239   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 240   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
 241   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
 242   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
 243
 244   setOperationAction(ISD::STORE, MVT::f32, Promote);
 245   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 246
 247   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
 248   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 249
 250   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
 251   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 252
 253   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
 254   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 255
 256   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
 257   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 258
 259   setOperationAction(ISD::STORE, MVT::i64, Promote);
 260   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 261
 262   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
 263   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
 264
 265   setOperationAction(ISD::STORE, MVT::f64, Promote);
 266   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
 267
 268   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
 269   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 270
 271   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
 272   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
 273   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 274   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 275
 276   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
 277   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
 278   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
 279   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
 280
 281   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 282   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
 283   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
 284   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
 285
 286   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 287   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 288
 289   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
 290   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 291
 292   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
 293   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 294
 295   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
 296   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 297
 298
 299   setOperationAction(ISD::Constant, MVT::i32, Legal);
 300   setOperationAction(ISD::Constant, MVT::i64, Legal);
 301   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 302   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 303
 304   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 305   setOperationAction(ISD::BRIND, MVT::Other, Expand);
 306
 307   // This is totally unsupported, just custom lower to produce an error.
 308   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 309
 310   // Library functions.  These default to Expand, but we have instructions
 311   // for them.
 312   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
 313   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
 314   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
 315   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
 316   setOperationAction(ISD::FABS,   MVT::f32, Legal);
 317   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 318   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 319   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 320   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
 321   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
 322
 323   setOperationAction(ISD::FROUND, MVT::f32, Custom);
 324   setOperationAction(ISD::FROUND, MVT::f64, Custom);
 325
 326   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
 327   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 328
 329   setOperationAction(ISD::FREM, MVT::f32, Custom);
 330   setOperationAction(ISD::FREM, MVT::f64, Custom);
 331
 332   // v_mad_f32 does not support denormals according to some sources.
 333   if (!Subtarget->hasFP32Denormals())
 334     setOperationAction(ISD::FMAD, MVT::f32, Legal);
 335
 336   // Expand to fneg + fadd.
 337   setOperationAction(ISD::FSUB, MVT::f64, Expand);
 338
 339   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
 340   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
 341   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
 342   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
 343   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
 344   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
 345   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
 346   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
 347   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
 348   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 349
 350   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 351     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
 352     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
 353     setOperationAction(ISD::FRINT, MVT::f64, Custom);
 354     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
 355   }
 356
 357   if (!Subtarget->hasBFI()) {
 358     // fcopysign can be done in a single instruction with BFI.
 359     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 360     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 361   }
 362
 363   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 364   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
 365   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
 366
 367   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 368   for (MVT VT : ScalarIntVTs) {
 369     // These should use [SU]DIVREM, so set them to expand
 370     setOperationAction(ISD::SDIV, VT, Expand);
 371     setOperationAction(ISD::UDIV, VT, Expand);
 372     setOperationAction(ISD::SREM, VT, Expand);
 373     setOperationAction(ISD::UREM, VT, Expand);
 374
 375     // GPU does not have divrem function for signed or unsigned.
 376     setOperationAction(ISD::SDIVREM, VT, Custom);
 377     setOperationAction(ISD::UDIVREM, VT, Custom);
 378
 379     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
 380     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 381     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 382
 383     setOperationAction(ISD::BSWAP, VT, Expand);
 384     setOperationAction(ISD::CTTZ, VT, Expand);
 385     setOperationAction(ISD::CTLZ, VT, Expand);
 386   }
 387
 388   if (!Subtarget->hasBCNT(32))
 389     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 390
 391   if (!Subtarget->hasBCNT(64))
 392     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 393
 394   // The hardware supports 32-bit ROTR, but not ROTL.
 395   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 396   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 397   setOperationAction(ISD::ROTR, MVT::i64, Expand);
 398
 399   setOperationAction(ISD::MUL, MVT::i64, Expand);
 400   setOperationAction(ISD::MULHU, MVT::i64, Expand);
 401   setOperationAction(ISD::MULHS, MVT::i64, Expand);
 402   setOperationAction(ISD::UDIV, MVT::i32, Expand);
 403   setOperationAction(ISD::UREM, MVT::i32, Expand);
 404   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 405   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 406   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 407   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 408   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 409
 410   setOperationAction(ISD::SMIN, MVT::i32, Legal);
 411   setOperationAction(ISD::UMIN, MVT::i32, Legal);
 412   setOperationAction(ISD::SMAX, MVT::i32, Legal);
 413   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 414
 415   if (Subtarget->hasFFBH())
 416     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
 417
 418   if (Subtarget->hasFFBL())
 419     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
 420
 421   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
 422   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 423
 424   // We only really have 32-bit BFE instructions (and 16-bit on VI).
 425   //
 426   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
 427   // effort to match them now. We want this to be false for i64 cases when the
 428   // extraction isn't restricted to the upper or lower half. Ideally we would
 429   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
 430   // span the midpoint are probably relatively rare, so don't worry about them
 431   // for now.
 432   if (Subtarget->hasBFE())
 433     setHasExtractBitsInsn(true);
 434
 435   static const MVT::SimpleValueType VectorIntTypes[] = {
 436     MVT::v2i32, MVT::v4i32
 437   };
 438
 439   for (MVT VT : VectorIntTypes) {
 440     // Expand the following operations for the current type by default.
 441     setOperationAction(ISD::ADD,  VT, Expand);
 442     setOperationAction(ISD::AND,  VT, Expand);
 443     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 444     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 445     setOperationAction(ISD::MUL,  VT, Expand);
 446     setOperationAction(ISD::MULHU, VT, Expand);
 447     setOperationAction(ISD::MULHS, VT, Expand);
 448     setOperationAction(ISD::OR,   VT, Expand);
 449     setOperationAction(ISD::SHL,  VT, Expand);
 450     setOperationAction(ISD::SRA,  VT, Expand);
 451     setOperationAction(ISD::SRL,  VT, Expand);
 452     setOperationAction(ISD::ROTL, VT, Expand);
 453     setOperationAction(ISD::ROTR, VT, Expand);
 454     setOperationAction(ISD::SUB,  VT, Expand);
 455     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 456     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 457     setOperationAction(ISD::SDIV, VT, Expand);
 458     setOperationAction(ISD::UDIV, VT, Expand);
 459     setOperationAction(ISD::SREM, VT, Expand);
 460     setOperationAction(ISD::UREM, VT, Expand);
 461     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 462     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 463     setOperationAction(ISD::SDIVREM, VT, Custom);
 464     setOperationAction(ISD::UDIVREM, VT, Expand);
 465     setOperationAction(ISD::ADDC, VT, Expand);
 466     setOperationAction(ISD::SUBC, VT, Expand);
 467     setOperationAction(ISD::ADDE, VT, Expand);
 468     setOperationAction(ISD::SUBE, VT, Expand);
 469     setOperationAction(ISD::SELECT, VT, Expand);
 470     setOperationAction(ISD::VSELECT, VT, Expand);
 471     setOperationAction(ISD::SELECT_CC, VT, Expand);
 472     setOperationAction(ISD::XOR,  VT, Expand);
 473     setOperationAction(ISD::BSWAP, VT, Expand);
 474     setOperationAction(ISD::CTPOP, VT, Expand);
 475     setOperationAction(ISD::CTTZ, VT, Expand);
 476     setOperationAction(ISD::CTLZ, VT, Expand);
 477     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 478   }
 479
 480   static const MVT::SimpleValueType FloatVectorTypes[] = {
 481     MVT::v2f32, MVT::v4f32
 482   };
 483
 484   for (MVT VT : FloatVectorTypes) {
 485     setOperationAction(ISD::FABS, VT, Expand);
 486     setOperationAction(ISD::FMINNUM, VT, Expand);
 487     setOperationAction(ISD::FMAXNUM, VT, Expand);
 488     setOperationAction(ISD::FADD, VT, Expand);
 489     setOperationAction(ISD::FCEIL, VT, Expand);
 490     setOperationAction(ISD::FCOS, VT, Expand);
 491     setOperationAction(ISD::FDIV, VT, Expand);
 492     setOperationAction(ISD::FEXP2, VT, Expand);
 493     setOperationAction(ISD::FLOG2, VT, Expand);
 494     setOperationAction(ISD::FREM, VT, Expand);
 495     setOperationAction(ISD::FPOW, VT, Expand);
 496     setOperationAction(ISD::FFLOOR, VT, Expand);
 497     setOperationAction(ISD::FTRUNC, VT, Expand);
 498     setOperationAction(ISD::FMUL, VT, Expand);
 499     setOperationAction(ISD::FMA, VT, Expand);
 500     setOperationAction(ISD::FRINT, VT, Expand);
 501     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 502     setOperationAction(ISD::FSQRT, VT, Expand);
 503     setOperationAction(ISD::FSIN, VT, Expand);
 504     setOperationAction(ISD::FSUB, VT, Expand);
 505     setOperationAction(ISD::FNEG, VT, Expand);
 506     setOperationAction(ISD::VSELECT, VT, Expand);
 507     setOperationAction(ISD::SELECT_CC, VT, Expand);
 508     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 509     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 510   }
 511
 512   // This causes using an unrolled select operation rather than expansion with
 513   // bit operations. This is in general better, but the alternative using BFI
 514   // instructions may be better if the select sources are SGPRs.
 515   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
 516   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
 517
 518   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
 519   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
 520
 521   // There are no libcalls of any kind.
 522   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
 523     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
 524
 525   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 526   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 527
 528   setSchedulingPreference(Sched::RegPressure);
 529   setJumpIsExpensive(true);
 530
 531   // FIXME: This is only partially true. If we have to do vector compares, any
 532   // SGPR pair can be a condition register. If we have a uniform condition, we
 533   // are better off doing SALU operations, where there is only one SCC. For now,
 534   // we don't have a way of knowing during instruction selection if a condition
 535   // will be uniform and we always use vector compares. Assume we are using
 536   // vector compares until that is fixed.
 537   setHasMultipleConditionRegisters(true);
 538
 539   // SI at least has hardware support for floating point exceptions, but no way
 540   // of using or handling them is implemented. They are also optional in OpenCL
 541   // (Section 7.3)
 542   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 543
 544   PredictableSelectIsExpensive = false;
 545
 546   // We want to find all load dependencies for long chains of stores to enable
 547   // merging into very wide vectors. The problem is with vectors with > 4
 548   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
 549   // vectors are a legal type, even though we have to split the loads
 550   // usually. When we can more precisely specify load legality per address
 551   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
 552   // smarter so that they can figure out what to do in 2 iterations without all
 553   // N > 4 stores on the same chain.
 554   GatherAllAliasesMaxDepth = 16;
 555
 556   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
 557   // about these during lowering.
 558   MaxStoresPerMemcpy  = 0xffffffff;
 559   MaxStoresPerMemmove = 0xffffffff;
 560   MaxStoresPerMemset  = 0xffffffff;
 561
 562   setTargetDAGCombine(ISD::BITCAST);
 563   setTargetDAGCombine(ISD::SHL);
 564   setTargetDAGCombine(ISD::SRA);
 565   setTargetDAGCombine(ISD::SRL);
 566   setTargetDAGCombine(ISD::MUL);
 567   setTargetDAGCombine(ISD::MULHU);
 568   setTargetDAGCombine(ISD::MULHS);
 569   setTargetDAGCombine(ISD::SELECT);
 570   setTargetDAGCombine(ISD::SELECT_CC);
 571   setTargetDAGCombine(ISD::STORE);
 572   setTargetDAGCombine(ISD::FADD);
 573   setTargetDAGCombine(ISD::FSUB);
 574   setTargetDAGCombine(ISD::FNEG);
 575   setTargetDAGCombine(ISD::FABS);
 576 }
 577
 578 //===----------------------------------------------------------------------===//
 579 // Target Information
 580 //===----------------------------------------------------------------------===//
 581
 582 LLVM_READNONE
 583 static bool fnegFoldsIntoOp(unsigned Opc) {
 584   switch (Opc) {
 585   case ISD::FADD:
 586   case ISD::FSUB:
 587   case ISD::FMUL:
 588   case ISD::FMA:
 589   case ISD::FMAD:
 590   case ISD::FMINNUM:
 591   case ISD::FMAXNUM:
 592   case ISD::FSIN:
 593   case ISD::FTRUNC:
 594   case ISD::FRINT:
 595   case ISD::FNEARBYINT:
 596   case AMDGPUISD::RCP:
 597   case AMDGPUISD::RCP_LEGACY:
 598   case AMDGPUISD::SIN_HW:
 599   case AMDGPUISD::FMUL_LEGACY:
 600   case AMDGPUISD::FMIN_LEGACY:
 601   case AMDGPUISD::FMAX_LEGACY:
 602     return true;
 603   default:
 604     return false;
 605   }
 606 }
 607
 608 /// \p returns true if the operation will definitely need to use a 64-bit
 609 /// encoding, and thus will use a VOP3 encoding regardless of the source
 610 /// modifiers.
 611 LLVM_READONLY
 612 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
 613   return N->getNumOperands() > 2 || VT == MVT::f64;
 614 }
 615
 616 // Most FP instructions support source modifiers, but this could be refined
 617 // slightly.
 618 LLVM_READONLY
 619 static bool hasSourceMods(const SDNode *N) {
 620   if (isa<MemSDNode>(N))
 621     return false;
 622
 623   switch (N->getOpcode()) {
 624   case ISD::CopyToReg:
 625   case ISD::SELECT:
 626   case ISD::FDIV:
 627   case ISD::FREM:
 628   case ISD::INLINEASM:
 629   case AMDGPUISD::INTERP_P1:
 630   case AMDGPUISD::INTERP_P2:
 631   case AMDGPUISD::DIV_SCALE:
 632
 633   // TODO: Should really be looking at the users of the bitcast. These are
 634   // problematic because bitcasts are used to legalize all stores to integer
 635   // types.
 636   case ISD::BITCAST:
 637     return false;
 638   default:
 639     return true;
 640   }
 641 }
 642
 643 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
 644                                                  unsigned CostThreshold) {
 645   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
 646   // it is truly free to use a source modifier in all cases. If there are
 647   // multiple users but for each one will necessitate using VOP3, there will be
 648   // a code size increase. Try to avoid increasing code size unless we know it
 649   // will save on the instruction count.
 650   unsigned NumMayIncreaseSize = 0;
 651   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
 652
 653   // XXX - Should this limit number of uses to check?
 654   for (const SDNode *U : N->uses()) {
 655     if (!hasSourceMods(U))
 656       return false;
 657
 658     if (!opMustUseVOP3Encoding(U, VT)) {
 659       if (++NumMayIncreaseSize > CostThreshold)
 660         return false;
 661     }
 662   }
 663
 664   return true;
 665 }
 666
/// \returns the type used for vector element indices. This is always
/// MVT::i32; the data layout argument is ignored.
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}
 670
/// \returns true for every \p SelType; the argument is not inspected.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
 674
 675 // The backend supports 32 and 64 bit floating point immediates.
 676 // FIXME: Why are we reporting vectors of FP immediates as legal?
 677 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 678   EVT ScalarVT = VT.getScalarType();
 679   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
 680          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
 681 }
 682
 683 // We don't want to shrink f64 / f32 constants.
 684 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 685   EVT ScalarVT = VT.getScalarType();
 686   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 687 }
 688
 689 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
 690                                                  ISD::LoadExtType,
 691                                                  EVT NewVT) const {
 692
 693   unsigned NewSize = NewVT.getStoreSizeInBits();
 694
 695   // If we are reducing to a 32-bit load, this is always better.
 696   if (NewSize == 32)
 697     return true;
 698
 699   EVT OldVT = N->getValueType(0);
 700   unsigned OldSize = OldVT.getStoreSizeInBits();
 701
 702   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
 703   // extloads, so doing one requires using a buffer_load. In cases where we
 704   // still couldn't use a scalar load, using the wider load shouldn't really
 705   // hurt anything.
 706
 707   // If the old size already had to be an extload, there's no harm in continuing
 708   // to reduce the width.
 709   return (OldSize < 32);
 710 }
 711
 712 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
 713                                                    EVT CastTy) const {
 714
 715   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 716
 717   if (LoadTy.getScalarType() == MVT::i32)
 718     return false;
 719
 720   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
 721   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 722
 723   return (LScalarSize < CastScalarSize) ||
 724          (CastScalarSize >= 32);
 725 }
 726
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
//
// Always reports cttz as cheap to speculate.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}
 734
// Always reports ctlz as cheap to speculate.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}
 738
 739 //===---------------------------------------------------------------------===//
 740 // Target Properties
 741 //===---------------------------------------------------------------------===//
 742
 743 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
 744   assert(VT.isFloatingPoint());
 745
 746   // Packed operations do not have a fabs modifier.
 747   return VT == MVT::f32 || VT == MVT::f64 ||
 748          (Subtarget->has16BitInsts() && VT == MVT::f16);
 749 }
 750
 751 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
 752   assert(VT.isFloatingPoint());
 753   return VT == MVT::f32 || VT == MVT::f64 ||
 754          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
 755          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
 756 }
 757
 758 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
 759                                                          unsigned NumElem,
 760                                                          unsigned AS) const {
 761   return true;
 762 }
 763
 764 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
 765   // There are few operations which truly have vector input operands. Any vector
 766   // operation is going to involve operations on each component, and a
 767   // build_vector will be a copy per element, so it always makes sense to use a
 768   // build_vector input in place of the extracted element to avoid a copy into a
 769   // super register.
 770   //
 771   // We should probably only do this if all users are extracts only, but this
 772   // should be the common case.
 773   return true;
 774 }
 775
 776 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
 777   // Truncate is just accessing a subregister.
 778
 779   unsigned SrcSize = Source.getSizeInBits();
 780   unsigned DestSize = Dest.getSizeInBits();
 781
 782   return DestSize < SrcSize && DestSize % 32 == 0 ;
 783 }
 784
 785 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
 786   // Truncate is just accessing a subregister.
 787
 788   unsigned SrcSize = Source->getScalarSizeInBits();
 789   unsigned DestSize = Dest->getScalarSizeInBits();
 790
 791   if (DestSize== 16 && Subtarget->has16BitInsts())
 792     return SrcSize >= 32;
 793
 794   return DestSize < SrcSize && DestSize % 32 == 0;
 795 }
 796
 797 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
 798   unsigned SrcSize = Src->getScalarSizeInBits();
 799   unsigned DestSize = Dest->getScalarSizeInBits();
 800
 801   if (SrcSize == 16 && Subtarget->has16BitInsts())
 802     return DestSize >= 32;
 803
 804   return SrcSize == 32 && DestSize == 64;
 805 }
 806
 807 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
 808   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
 809   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
 810   // this will enable reducing 64-bit operations the 32-bit, which is always
 811   // good.
 812
 813   if (Src == MVT::i16)
 814     return Dest == MVT::i32 ||Dest == MVT::i64 ;
 815
 816   return Src == MVT::i32 && Dest == MVT::i64;
 817 }
 818
 819 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 820   return isZExtFree(Val.getValueType(), VT2);
 821 }
 822
 823 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 824   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
 825   // limited number of native 64-bit operations. Shrinking an operation to fit
 826   // in a single 32-bit register should always be helpful. As currently used,
 827   // this is much less general than the name suggests, and is only used in
 828   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
 829   // not profitable, and may actually be harmful.
 830   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 831 }
 832
 833 //===---------------------------------------------------------------------===//
 834 // TargetLowering Callbacks
 835 //===---------------------------------------------------------------------===//
 836
 837 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
 838                                                   bool IsVarArg) {
 839   switch (CC) {
 840   case CallingConv::AMDGPU_KERNEL:
 841   case CallingConv::SPIR_KERNEL:
 842     return CC_AMDGPU_Kernel;
 843   case CallingConv::AMDGPU_VS:
 844   case CallingConv::AMDGPU_GS:
 845   case CallingConv::AMDGPU_PS:
 846   case CallingConv::AMDGPU_CS:
 847   case CallingConv::AMDGPU_HS:
 848     return CC_AMDGPU;
 849   case CallingConv::C:
 850   case CallingConv::Fast:
 851     return CC_AMDGPU_Func;
 852   default:
 853     report_fatal_error("Unsupported calling convention.");
 854   }
 855 }
 856
 857 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 858                                                     bool IsVarArg) {
 859   switch (CC) {
 860   case CallingConv::AMDGPU_KERNEL:
 861   case CallingConv::SPIR_KERNEL:
 862     return CC_AMDGPU_Kernel;
 863   case CallingConv::AMDGPU_VS:
 864   case CallingConv::AMDGPU_GS:
 865   case CallingConv::AMDGPU_PS:
 866   case CallingConv::AMDGPU_CS:
 867   case CallingConv::AMDGPU_HS:
 868     return RetCC_SI_Shader;
 869   case CallingConv::C:
 870   case CallingConv::Fast:
 871     return RetCC_AMDGPU_Func;
 872   default:
 873     report_fatal_error("Unsupported calling convention.");
 874   }
 875 }
 876
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types.  However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers.  Each item in the Ins array
/// represents a single value that will be stored in registers.  Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument.  Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x].  In most cases the correct memory type will be
/// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split.  From this, we deduce that the memory type
/// for each individual part is i8.  We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
///
/// \param State  Calling-convention state that receives one location per
///               entry of \p Ins.
/// \param Ins    The (already split) incoming argument parts.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    const ISD::InputArg &In = Ins[i];
    // Memory type deduced for this part; every branch below assigns it or
    // hits llvm_unreachable.
    EVT MemVT;

    // Number of registers the *original* argument type occupies.
    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);

    if (!Subtarget->isAmdHsaOS() &&
        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
      // The ABI says the caller will extend these values to 32-bits.
      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
    } else if (NumRegs == 1) {
      // This argument is not split, so the IR type is the memory type.
      assert(!In.Flags.isSplit());
      if (In.ArgVT.isExtended()) {
        // We have an extended type, like i24, so we should just use the register type
        MemVT = In.VT;
      } else {
        MemVT = In.ArgVT;
      }
    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
      // We have a vector value which has been split into a vector with
      // the same scalar type, but fewer elements.  This should handle
      // all the floating-point vector types.
      MemVT = In.VT;
    } else if (In.ArgVT.isVector() &&
               In.ArgVT.getVectorNumElements() == NumRegs) {
      // This arg has been split so that each element is stored in a separate
      // register.
      MemVT = In.ArgVT.getScalarType();
    } else if (In.ArgVT.isExtended()) {
      // We have an extended type, like i65.
      MemVT = In.VT;
    } else {
      // Fallback: divide the argument's store size evenly between the
      // registers and build an integer (or integer vector) type of that size.
      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
      if (In.VT.isInteger()) {
        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
      } else if (In.VT.isVector()) {
        assert(!In.VT.getScalarType().isFloatingPoint());
        unsigned NumElements = In.VT.getVectorNumElements();
        assert(MemoryBits % NumElements == 0);
        // This vector type has been split into another vector type with
        // a different elements size.
        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                         MemoryBits / NumElements);
        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
      } else {
        llvm_unreachable("cannot deduce memory type.");
      }
    }

    // Convert one element vectors to scalar.
    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
      MemVT = MemVT.getScalarType();

    if (MemVT.isExtended()) {
      // This should really only happen if we have vec3 arguments
      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
      // Widen to the power-of-two vector type.
      MemVT = MemVT.getPow2VectorType(State.getContext());
    }

    assert(MemVT.isSimple());
    // Record the location: the register type (In.VT) is the value type and
    // the deduced memory type is the location type.
    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
                    State);
  }
}
 973
 974 SDValue AMDGPUTargetLowering::LowerReturn(
 975   SDValue Chain, CallingConv::ID CallConv,
 976   bool isVarArg,
 977   const SmallVectorImpl<ISD::OutputArg> &Outs,
 978   const SmallVectorImpl<SDValue> &OutVals,
 979   const SDLoc &DL, SelectionDAG &DAG) const {
 980   // FIXME: Fails for r600 tests
 981   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
 982   // "wave terminate should not have return values");
 983   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 984 }
 985
 986 //===---------------------------------------------------------------------===//
 987 // Target specific lowering
 988 //===---------------------------------------------------------------------===//
 989
 990 /// Selects the correct CCAssignFn for a given CallingConvention value.
 991 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
 992                                                     bool IsVarArg) {
 993   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
 994 }
 995
 996 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
 997                                                       bool IsVarArg) {
 998   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 999 }
1000
1001 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1002                                         SmallVectorImpl<SDValue> &InVals) const {
1003   SDValue Callee = CLI.Callee;
1004   SelectionDAG &DAG = CLI.DAG;
1005
1006   const Function &Fn = *DAG.getMachineFunction().getFunction();
1007
1008   StringRef FuncName("<unknown>");
1009
1010   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1011     FuncName = G->getSymbol();
1012   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1013     FuncName = G->getGlobal()->getName();
1014
1015   DiagnosticInfoUnsupported NoCalls(
1016       Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
1017   DAG.getContext()->diagnose(NoCalls);
1018
1019   if (!CLI.IsTailCall) {
1020     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1021       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1022   }
1023
1024   return DAG.getEntryNode();
1025 }
1026
1027 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1028                                                       SelectionDAG &DAG) const {
1029   const Function &Fn = *DAG.getMachineFunction().getFunction();
1030
1031   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1032                                             SDLoc(Op).getDebugLoc());
1033   DAG.getContext()->diagnose(NoDynamicAlloca);
1034   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1035   return DAG.getMergeValues(Ops, SDLoc());
1036 }
1037
1038 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1039                                              SelectionDAG &DAG) const {
1040   switch (Op.getOpcode()) {
1041   default:
1042     Op->print(errs(), &DAG);
1043     llvm_unreachable("Custom lowering code for this"
1044                      "instruction is not implemented yet!");
1045     break;
1046   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1047   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1048   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1049   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1050   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1051   case ISD::FREM: return LowerFREM(Op, DAG);
1052   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1053   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1054   case ISD::FRINT: return LowerFRINT(Op, DAG);
1055   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1056   case ISD::FROUND: return LowerFROUND(Op, DAG);
1057   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1058   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1059   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1060   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1061   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1062   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1063   case ISD::CTLZ:
1064   case ISD::CTLZ_ZERO_UNDEF:
1065     return LowerCTLZ(Op, DAG);
1066   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1067   }
1068   return Op;
1069 }
1070
1071 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1072                                               SmallVectorImpl<SDValue> &Results,
1073                                               SelectionDAG &DAG) const {
1074   switch (N->getOpcode()) {
1075   case ISD::SIGN_EXTEND_INREG:
1076     // Different parts of legalization seem to interpret which type of
1077     // sign_extend_inreg is the one to check for custom lowering. The extended
1078     // from type is what really matters, but some places check for custom
1079     // lowering of the result type. This results in trying to use
1080     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1081     // nothing here and let the illegal result integer be handled normally.
1082     return;
1083   default:
1084     return;
1085   }
1086 }
1087
1088 static bool hasDefinedInitializer(const GlobalValue *GV) {
1089   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1090   if (!GVar || !GVar->hasInitializer())
1091     return false;
1092
1093   return !isa<UndefValue>(GVar->getInitializer());
1094 }
1095
1096 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1097                                                  SDValue Op,
1098                                                  SelectionDAG &DAG) const {
1099
1100   const DataLayout &DL = DAG.getDataLayout();
1101   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1102   const GlobalValue *GV = G->getGlobal();
1103
1104   if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
1105     // XXX: What does the value of G->getOffset() mean?
1106     assert(G->getOffset() == 0 &&
1107          "Do not know what to do with an non-zero offset");
1108
1109     // TODO: We could emit code to handle the initialization somewhere.
1110     if (!hasDefinedInitializer(GV)) {
1111       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1112       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1113     }
1114   }
1115
1116   const Function &Fn = *DAG.getMachineFunction().getFunction();
1117   DiagnosticInfoUnsupported BadInit(
1118       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1119   DAG.getContext()->diagnose(BadInit);
1120   return SDValue();
1121 }
1122
1123 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1124                                                   SelectionDAG &DAG) const {
1125   SmallVector<SDValue, 8> Args;
1126
1127   for (const SDUse &U : Op->ops())
1128     DAG.ExtractVectorElements(U.get(), Args);
1129
1130   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1131 }
1132
1133 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1134                                                      SelectionDAG &DAG) const {
1135
1136   SmallVector<SDValue, 8> Args;
1137   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1138   EVT VT = Op.getValueType();
1139   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1140                             VT.getVectorNumElements());
1141
1142   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1143 }
1144
/// \brief Try to turn a select driven by a floating-point compare into an
/// FMIN_LEGACY / FMAX_LEGACY node.
///
/// \p LHS and \p RHS are the compare operands, \p True and \p False are the
/// select operands and \p CC is the condition code node. The combine only
/// applies when the select operands are exactly the compare operands, in
/// either order.
///
/// \returns the new min/max node, or an empty SDValue when the pattern does
/// not match, when the predicate is not a less/greater relation, or (for the
/// ordered and unspecified-order predicates) when called before the
/// AfterLegalizeDAG combine level by something other than the legalizer.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  // The select must choose between the two values that were compared.
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  // Equality, constant and ordered-ness predicates do not describe a min or
  // max; fall out of the switch and return an empty SDValue.
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  // Unordered less-than: note the operands are swapped relative to the
  // ordered case below.
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  // Unordered greater-than: mirror image of the unordered less-than case.
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  // Ordered (or unspecified) greater-than: same legalization gating as the
  // ordered less-than case.
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
1218
1219 std::pair<SDValue, SDValue>
1220 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1221   SDLoc SL(Op);
1222
1223   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1224
1225   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1226   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1227
1228   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1229   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1230
1231   return std::make_pair(Lo, Hi);
1232 }
1233
1234 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1235   SDLoc SL(Op);
1236
1237   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1238   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1239   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1240 }
1241
1242 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1243   SDLoc SL(Op);
1244
1245   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1246   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1247   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1248 }
1249
1250 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1251                                               SelectionDAG &DAG) const {
1252   LoadSDNode *Load = cast<LoadSDNode>(Op);
1253   EVT VT = Op.getValueType();
1254
1255
1256   // If this is a 2 element vector, we really want to scalarize and not create
1257   // weird 1 element vectors.
1258   if (VT.getVectorNumElements() == 2)
1259     return scalarizeVectorLoad(Load, DAG);
1260
1261   SDValue BasePtr = Load->getBasePtr();
1262   EVT PtrVT = BasePtr.getValueType();
1263   EVT MemVT = Load->getMemoryVT();
1264   SDLoc SL(Op);
1265
1266   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1267
1268   EVT LoVT, HiVT;
1269   EVT LoMemVT, HiMemVT;
1270   SDValue Lo, Hi;
1271
1272   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1273   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1274   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1275
1276   unsigned Size = LoMemVT.getStoreSize();
1277   unsigned BaseAlign = Load->getAlignment();
1278   unsigned HiAlign = MinAlign(BaseAlign, Size);
1279
1280   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1281                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1282                                   BaseAlign, Load->getMemOperand()->getFlags());
1283   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1284                               DAG.getConstant(Size, SL, PtrVT));
1285   SDValue HiLoad =
1286       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1287                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1288                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1289
1290   SDValue Ops[] = {
1291     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1292     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1293                 LoLoad.getValue(1), HiLoad.getValue(1))
1294   };
1295
1296   return DAG.getMergeValues(Ops, SL);
1297 }
1298
1299 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1300                                                SelectionDAG &DAG) const {
1301   StoreSDNode *Store = cast<StoreSDNode>(Op);
1302   SDValue Val = Store->getValue();
1303   EVT VT = Val.getValueType();
1304
1305   // If this is a 2 element vector, we really want to scalarize and not create
1306   // weird 1 element vectors.
1307   if (VT.getVectorNumElements() == 2)
1308     return scalarizeVectorStore(Store, DAG);
1309
1310   EVT MemVT = Store->getMemoryVT();
1311   SDValue Chain = Store->getChain();
1312   SDValue BasePtr = Store->getBasePtr();
1313   SDLoc SL(Op);
1314
1315   EVT LoVT, HiVT;
1316   EVT LoMemVT, HiMemVT;
1317   SDValue Lo, Hi;
1318
1319   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1320   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1321   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1322
1323   EVT PtrVT = BasePtr.getValueType();
1324   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1325                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
1326                                               PtrVT));
1327
1328   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1329   unsigned BaseAlign = Store->getAlignment();
1330   unsigned Size = LoMemVT.getStoreSize();
1331   unsigned HiAlign = MinAlign(BaseAlign, Size);
1332
1333   SDValue LoStore =
1334       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1335                         Store->getMemOperand()->getFlags());
1336   SDValue HiStore =
1337       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1338                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1339
1340   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1341 }
1342
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
//
// Op is the divrem node (operand 0 is the numerator, operand 1 the
// denominator); Sign selects signed versus unsigned conversions. Returns an
// empty SDValue when either operand has fewer than 9 known sign bits;
// otherwise returns the merged pair { Div, Rem }.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  // Bail out unless both operands have at least 9 known sign bits.
  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  // Number of bits the division actually needs; the signed case keeps one
  // extra bit.
  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // jq is the correction added to the truncated quotient when the remainder
  // check below fires: 1 for unsigned, +1 or -1 for signed.
  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  // float fq = fa * rcp(fb);
  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  // float fr = mad(fqneg, fb, fa);
  // FMAD_FTZ is used when the subtarget reports FP32 denormals, plain FMAD
  // otherwise.
  unsigned OpCode = Subtarget->hasFP32Denormals() ?
                    (unsigned)AMDGPUISD::FMAD_FTZ :
                    (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  // as LHS - Div * RHS.
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    // Signed: sign-extend both results from DivBits.
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    // Unsigned: mask both results down to the low DivBits bits.
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
1451
/// Expand a 64-bit unsigned divrem in terms of 32-bit operations.
///
/// \p Op must have type i64; operand 0 is the numerator and operand 1 the
/// denominator. On return \p Results has had the quotient and then the
/// remainder appended, both as i64 values.
///
/// When the upper 32 bits of both operands are known to be zero, a single
/// 32-bit UDIVREM is emitted and its results are zero-extended. Otherwise the
/// high quotient half comes from a speculative 32-bit divide and the low half
/// is produced one bit at a time by a fully unrolled shift-and-subtract
/// sequence.
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                      SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &Results) const {
  assert(Op.getValueType() == MVT::i64);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue one = DAG.getConstant(1, DL, HalfVT);
  SDValue zero = DAG.getConstant(0, DL, HalfVT);

  // Split both operands into high and low halves.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

  // Fast path: both operands fit in 32 bits, so one half-width UDIVREM is
  // enough. (The VT check repeats the assertion at the top of the function.)
  if (VT == MVT::i64 &&
    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    // Zero-extend each result by pairing it with a zero high half.
    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  // Get Speculative values: divide the high half of LHS by the low half of
  // RHS. These results are only selected when RHS_Hi == 0.
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  // Initial remainder: REM_Part if RHS_Hi == 0, otherwise LHS_Hi,
  // zero-extended to 64 bits.
  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  // High quotient half: DIV_Part if RHS_Hi == 0, otherwise zero.
  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
  SDValue DIV_Lo = zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  // Emit one shift/compare/subtract step per bit of LHS_Lo, from the most
  // significant bit down, accumulating the low quotient half in DIV_Lo.
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    // Set this quotient bit when the running remainder is >= RHS.
    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM: subtract RHS under the same condition.
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  // Reassemble the 64-bit quotient from its two halves.
  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
1529
1530 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1531                                            SelectionDAG &DAG) const {
1532   SDLoc DL(Op);
1533   EVT VT = Op.getValueType();
1534
1535   if (VT == MVT::i64) {
1536     SmallVector<SDValue, 2> Results;
1537     LowerUDIVREM64(Op, DAG, Results);
1538     return DAG.getMergeValues(Results, DL);
1539   }
1540
1541   if (VT == MVT::i32) {
1542     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1543       return Res;
1544   }
1545
1546   SDValue Num = Op.getOperand(0);
1547   SDValue Den = Op.getOperand(1);
1548
1549   // RCP =  URECIP(Den) = 2^32 / Den + e
1550   // e is rounding error.
1551   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1552
1553   // RCP_LO = mul(RCP, Den) */
1554   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1555
1556   // RCP_HI = mulhu (RCP, Den) */
1557   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1558
1559   // NEG_RCP_LO = -RCP_LO
1560   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1561                                                      RCP_LO);
1562
1563   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1564   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1565                                            NEG_RCP_LO, RCP_LO,
1566                                            ISD::SETEQ);
1567   // Calculate the rounding error from the URECIP instruction
1568   // E = mulhu(ABS_RCP_LO, RCP)
1569   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1570
1571   // RCP_A_E = RCP + E
1572   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1573
1574   // RCP_S_E = RCP - E
1575   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1576
1577   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1578   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1579                                      RCP_A_E, RCP_S_E,
1580                                      ISD::SETEQ);
1581   // Quotient = mulhu(Tmp0, Num)
1582   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1583
1584   // Num_S_Remainder = Quotient * Den
1585   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1586
1587   // Remainder = Num - Num_S_Remainder
1588   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1589
1590   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1591   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1592                                                  DAG.getConstant(-1, DL, VT),
1593                                                  DAG.getConstant(0, DL, VT),
1594                                                  ISD::SETUGE);
1595   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1596   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1597                                                   Num_S_Remainder,
1598                                                   DAG.getConstant(-1, DL, VT),
1599                                                   DAG.getConstant(0, DL, VT),
1600                                                   ISD::SETUGE);
1601   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1602   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1603                                                Remainder_GE_Zero);
1604
1605   // Calculate Division result:
1606
1607   // Quotient_A_One = Quotient + 1
1608   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1609                                        DAG.getConstant(1, DL, VT));
1610
1611   // Quotient_S_One = Quotient - 1
1612   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1613                                        DAG.getConstant(1, DL, VT));
1614
1615   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1616   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1617                                      Quotient, Quotient_A_One, ISD::SETEQ);
1618
1619   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1620   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1621                             Quotient_S_One, Div, ISD::SETEQ);
1622
1623   // Calculate Rem result:
1624
1625   // Remainder_S_Den = Remainder - Den
1626   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1627
1628   // Remainder_A_Den = Remainder + Den
1629   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1630
1631   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1632   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1633                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1634
1635   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1636   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1637                             Remainder_A_Den, Rem, ISD::SETEQ);
1638   SDValue Ops[2] = {
1639     Div,
1640     Rem
1641   };
1642   return DAG.getMergeValues(Ops, DL);
1643 }
1644
1645 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1646                                            SelectionDAG &DAG) const {
1647   SDLoc DL(Op);
1648   EVT VT = Op.getValueType();
1649
1650   SDValue LHS = Op.getOperand(0);
1651   SDValue RHS = Op.getOperand(1);
1652
1653   SDValue Zero = DAG.getConstant(0, DL, VT);
1654   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1655
1656   if (VT == MVT::i32) {
1657     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1658       return Res;
1659   }
1660
1661   if (VT == MVT::i64 &&
1662       DAG.ComputeNumSignBits(LHS) > 32 &&
1663       DAG.ComputeNumSignBits(RHS) > 32) {
1664     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1665
1666     //HiLo split
1667     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1668     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1669     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1670                                  LHS_Lo, RHS_Lo);
1671     SDValue Res[2] = {
1672       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1673       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1674     };
1675     return DAG.getMergeValues(Res, DL);
1676   }
1677
1678   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1679   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1680   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1681   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1682
1683   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1684   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1685
1686   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1687   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1688
1689   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1690   SDValue Rem = Div.getValue(1);
1691
1692   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1693   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1694
1695   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1696   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1697
1698   SDValue Res[2] = {
1699     Div,
1700     Rem
1701   };
1702   return DAG.getMergeValues(Res, DL);
1703 }
1704
1705 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1706 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1707   SDLoc SL(Op);
1708   EVT VT = Op.getValueType();
1709   SDValue X = Op.getOperand(0);
1710   SDValue Y = Op.getOperand(1);
1711
1712   // TODO: Should this propagate fast-math-flags?
1713
1714   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1715   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1716   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1717
1718   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1719 }
1720
1721 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1722   SDLoc SL(Op);
1723   SDValue Src = Op.getOperand(0);
1724
1725   // result = trunc(src)
1726   // if (src > 0.0 && src != result)
1727   //   result += 1.0
1728
1729   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1730
1731   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1732   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1733
1734   EVT SetCCVT =
1735       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1736
1737   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1738   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1739   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1740
1741   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1742   // TODO: Should this propagate fast-math-flags?
1743   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1744 }
1745
1746 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1747                                   SelectionDAG &DAG) {
1748   const unsigned FractBits = 52;
1749   const unsigned ExpBits = 11;
1750
1751   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1752                                 Hi,
1753                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
1754                                 DAG.getConstant(ExpBits, SL, MVT::i32));
1755   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1756                             DAG.getConstant(1023, SL, MVT::i32));
1757
1758   return Exp;
1759 }
1760
1761 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1762   SDLoc SL(Op);
1763   SDValue Src = Op.getOperand(0);
1764
1765   assert(Op.getValueType() == MVT::f64);
1766
1767   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1768   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1769
1770   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1771
1772   // Extract the upper half, since this is where we will find the sign and
1773   // exponent.
1774   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1775
1776   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1777
1778   const unsigned FractBits = 52;
1779
1780   // Extract the sign bit.
1781   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1782   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1783
1784   // Extend back to to 64-bits.
1785   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1786   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1787
1788   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1789   const SDValue FractMask
1790     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1791
1792   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1793   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1794   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1795
1796   EVT SetCCVT =
1797       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1798
1799   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1800
1801   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1802   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1803
1804   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1805   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1806
1807   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1808 }
1809
1810 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1811   SDLoc SL(Op);
1812   SDValue Src = Op.getOperand(0);
1813
1814   assert(Op.getValueType() == MVT::f64);
1815
1816   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1817   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1818   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1819
1820   // TODO: Should this propagate fast-math-flags?
1821
1822   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1823   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1824
1825   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1826
1827   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1828   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1829
1830   EVT SetCCVT =
1831       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1832   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1833
1834   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1835 }
1836
1837 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1838   // FNEARBYINT and FRINT are the same, except in their handling of FP
1839   // exceptions. Those aren't really meaningful for us, and OpenCL only has
1840   // rint, so just treat them as equivalent.
1841   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1842 }
1843
1844 // XXX - May require not supporting f32 denormals?
1845
1846 // Don't handle v2f16. The extra instructions to scalarize and repack around the
1847 // compare and vselect end up producing worse code than scalarizing the whole
1848 // operation.
1849 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
1850   SDLoc SL(Op);
1851   SDValue X = Op.getOperand(0);
1852   EVT VT = Op.getValueType();
1853
1854   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
1855
1856   // TODO: Should this propagate fast-math-flags?
1857
1858   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
1859
1860   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
1861
1862   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
1863   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
1864   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
1865
1866   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
1867
1868   EVT SetCCVT =
1869       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1870
1871   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1872
1873   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
1874
1875   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
1876 }
1877
1878 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1879   SDLoc SL(Op);
1880   SDValue X = Op.getOperand(0);
1881
1882   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1883
1884   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1885   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1886   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1887   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1888   EVT SetCCVT =
1889       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1890
1891   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1892
1893   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1894
1895   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1896
1897   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1898                                        MVT::i64);
1899
1900   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1901   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1902                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
1903                                           MVT::i64),
1904                           Exp);
1905
1906   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1907   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1908                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
1909                               ISD::SETNE);
1910
1911   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1912                              D, DAG.getConstant(0, SL, MVT::i64));
1913   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1914
1915   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1916   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1917
1918   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1919   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1920   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1921
1922   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1923                             ExpEqNegOne,
1924                             DAG.getConstantFP(1.0, SL, MVT::f64),
1925                             DAG.getConstantFP(0.0, SL, MVT::f64));
1926
1927   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1928
1929   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1930   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1931
1932   return K;
1933 }
1934
1935 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1936   EVT VT = Op.getValueType();
1937
1938   if (VT == MVT::f32 || VT == MVT::f16)
1939     return LowerFROUND32_16(Op, DAG);
1940
1941   if (VT == MVT::f64)
1942     return LowerFROUND64(Op, DAG);
1943
1944   llvm_unreachable("unhandled type");
1945 }
1946
1947 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1948   SDLoc SL(Op);
1949   SDValue Src = Op.getOperand(0);
1950
1951   // result = trunc(src);
1952   // if (src < 0.0 && src != result)
1953   //   result += -1.0.
1954
1955   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1956
1957   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1958   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1959
1960   EVT SetCCVT =
1961       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1962
1963   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1964   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1965   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1966
1967   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1968   // TODO: Should this propagate fast-math-flags?
1969   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1970 }
1971
1972 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1973   SDLoc SL(Op);
1974   SDValue Src = Op.getOperand(0);
1975   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1976
1977   if (ZeroUndef && Src.getValueType() == MVT::i32)
1978     return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1979
1980   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1981
1982   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1983   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1984
1985   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1986   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1987
1988   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1989                                    *DAG.getContext(), MVT::i32);
1990
1991   SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1992
1993   SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1994   SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1995
1996   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1997   SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1998
1999   // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2000   SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
2001
2002   if (!ZeroUndef) {
2003     // Test if the full 64-bit input is zero.
2004
2005     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2006     // which we probably don't want.
2007     SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
2008     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
2009
2010     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2011     // with the same cycles, otherwise it is slower.
2012     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2013     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2014
2015     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2016
2017     // The instruction returns -1 for 0 input, but the defined intrinsic
2018     // behavior is to return the number of bits.
2019     NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2020                           SrcIsZero, Bits32, NewCtlz);
2021   }
2022
2023   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
2024 }
2025
2026 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2027                                                bool Signed) const {
2028   // Unsigned
2029   // cul2f(ulong u)
2030   //{
2031   //  uint lz = clz(u);
2032   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2033   //  u = (u << lz) & 0x7fffffffffffffffUL;
2034   //  ulong t = u & 0xffffffffffUL;
2035   //  uint v = (e << 23) | (uint)(u >> 40);
2036   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2037   //  return as_float(v + r);
2038   //}
2039   // Signed
2040   // cl2f(long l)
2041   //{
2042   //  long s = l >> 63;
2043   //  float r = cul2f((l + s) ^ s);
2044   //  return s ? -r : r;
2045   //}
2046
2047   SDLoc SL(Op);
2048   SDValue Src = Op.getOperand(0);
2049   SDValue L = Src;
2050
2051   SDValue S;
2052   if (Signed) {
2053     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2054     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2055
2056     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2057     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2058   }
2059
2060   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2061                                    *DAG.getContext(), MVT::f32);
2062
2063
2064   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2065   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2066   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2067   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2068
2069   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2070   SDValue E = DAG.getSelect(SL, MVT::i32,
2071     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2072     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2073     ZeroI32);
2074
2075   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2076     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2077     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2078
2079   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2080                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2081
2082   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2083                              U, DAG.getConstant(40, SL, MVT::i64));
2084
2085   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2086     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2087     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2088
2089   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2090   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2091   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2092
2093   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2094
2095   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2096
2097   SDValue R = DAG.getSelect(SL, MVT::i32,
2098     RCmp,
2099     One,
2100     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2101   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2102   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2103
2104   if (!Signed)
2105     return R;
2106
2107   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2108   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2109 }
2110
2111 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2112                                                bool Signed) const {
2113   SDLoc SL(Op);
2114   SDValue Src = Op.getOperand(0);
2115
2116   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2117
2118   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2119                            DAG.getConstant(0, SL, MVT::i32));
2120   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2121                            DAG.getConstant(1, SL, MVT::i32));
2122
2123   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2124                               SL, MVT::f64, Hi);
2125
2126   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2127
2128   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2129                               DAG.getConstant(32, SL, MVT::i32));
2130   // TODO: Should this propagate fast-math-flags?
2131   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2132 }
2133
2134 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2135                                                SelectionDAG &DAG) const {
2136   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2137          "operation should be legal");
2138
2139   // TODO: Factor out code common with LowerSINT_TO_FP.
2140
2141   EVT DestVT = Op.getValueType();
2142   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2143     SDLoc DL(Op);
2144     SDValue Src = Op.getOperand(0);
2145
2146     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2147     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2148     SDValue FPRound =
2149         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2150
2151     return FPRound;
2152   }
2153
2154   if (DestVT == MVT::f32)
2155     return LowerINT_TO_FP32(Op, DAG, false);
2156
2157   assert(DestVT == MVT::f64);
2158   return LowerINT_TO_FP64(Op, DAG, false);
2159 }
2160
2161 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2162                                               SelectionDAG &DAG) const {
2163   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2164          "operation should be legal");
2165
2166   // TODO: Factor out code common with LowerUINT_TO_FP.
2167
2168   EVT DestVT = Op.getValueType();
2169   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2170     SDLoc DL(Op);
2171     SDValue Src = Op.getOperand(0);
2172
2173     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2174     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2175     SDValue FPRound =
2176         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2177
2178     return FPRound;
2179   }
2180
2181   if (DestVT == MVT::f32)
2182     return LowerINT_TO_FP32(Op, DAG, true);
2183
2184   assert(DestVT == MVT::f64);
2185   return LowerINT_TO_FP64(Op, DAG, true);
2186 }
2187
2188 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2189                                                bool Signed) const {
2190   SDLoc SL(Op);
2191
2192   SDValue Src = Op.getOperand(0);
2193
2194   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2195
2196   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2197                                  MVT::f64);
2198   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2199                                  MVT::f64);
2200   // TODO: Should this propagate fast-math-flags?
2201   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2202
2203   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2204
2205
2206   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2207
2208   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2209                            MVT::i32, FloorMul);
2210   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2211
2212   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2213
2214   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2215 }
2216
2217 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2218   SDLoc DL(Op);
2219   SDValue N0 = Op.getOperand(0);
2220
2221   // Convert to target node to get known bits
2222   if (N0.getValueType() == MVT::f32)
2223     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2224
2225   if (getTargetMachine().Options.UnsafeFPMath) {
2226     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2227     return SDValue();
2228   }
2229
2230   assert(N0.getSimpleValueType() == MVT::f64);
2231
2232   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2233   const unsigned ExpMask = 0x7ff;
2234   const unsigned ExpBiasf64 = 1023;
2235   const unsigned ExpBiasf16 = 15;
2236   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2237   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2238   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2239   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2240                            DAG.getConstant(32, DL, MVT::i64));
2241   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2242   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2243   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2244                           DAG.getConstant(20, DL, MVT::i64));
2245   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2246                   DAG.getConstant(ExpMask, DL, MVT::i32));
2247   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2248   // add the f16 bias (15) to get the biased exponent for the f16 format.
2249   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2250                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2251
2252   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2253                           DAG.getConstant(8, DL, MVT::i32));
2254   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2255                   DAG.getConstant(0xffe, DL, MVT::i32));
2256
2257   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2258                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2259   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2260
2261   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2262   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2263
2264   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2265   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2266       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2267                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2268
2269   // N = M | (E << 12);
2270   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2271       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2272                   DAG.getConstant(12, DL, MVT::i32)));
2273
2274   // B = clamp(1-E, 0, 13);
2275   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2276                                   One, E);
2277   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2278   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2279                   DAG.getConstant(13, DL, MVT::i32));
2280
2281   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2282                                    DAG.getConstant(0x1000, DL, MVT::i32));
2283
2284   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2285   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2286   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2287   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2288
2289   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2290   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2291                               DAG.getConstant(0x7, DL, MVT::i32));
2292   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2293                   DAG.getConstant(2, DL, MVT::i32));
2294   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2295                                One, Zero, ISD::SETEQ);
2296   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2297                                One, Zero, ISD::SETGT);
2298   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2299   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2300
2301   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2302                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2303   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2304                       I, V, ISD::SETEQ);
2305
2306   // Extract the sign bit.
2307   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2308                             DAG.getConstant(16, DL, MVT::i32));
2309   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2310                      DAG.getConstant(0x8000, DL, MVT::i32));
2311
2312   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2313   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2314 }
2315
2316 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2317                                               SelectionDAG &DAG) const {
2318   SDValue Src = Op.getOperand(0);
2319
2320   // TODO: Factor out code common with LowerFP_TO_UINT.
2321
2322   EVT SrcVT = Src.getValueType();
2323   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2324     SDLoc DL(Op);
2325
2326     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2327     SDValue FpToInt32 =
2328         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2329
2330     return FpToInt32;
2331   }
2332
2333   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2334     return LowerFP64_TO_INT(Op, DAG, true);
2335
2336   return SDValue();
2337 }
2338
2339 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2340                                               SelectionDAG &DAG) const {
2341   SDValue Src = Op.getOperand(0);
2342
2343   // TODO: Factor out code common with LowerFP_TO_SINT.
2344
2345   EVT SrcVT = Src.getValueType();
2346   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2347     SDLoc DL(Op);
2348
2349     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2350     SDValue FpToInt32 =
2351         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2352
2353     return FpToInt32;
2354   }
2355
2356   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2357     return LowerFP64_TO_INT(Op, DAG, false);
2358
2359   return SDValue();
2360 }
2361
2362 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2363                                                      SelectionDAG &DAG) const {
2364   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2365   MVT VT = Op.getSimpleValueType();
2366   MVT ScalarVT = VT.getScalarType();
2367
2368   assert(VT.isVector());
2369
2370   SDValue Src = Op.getOperand(0);
2371   SDLoc DL(Op);
2372
2373   // TODO: Don't scalarize on Evergreen?
2374   unsigned NElts = VT.getVectorNumElements();
2375   SmallVector<SDValue, 8> Args;
2376   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2377
2378   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2379   for (unsigned I = 0; I < NElts; ++I)
2380     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2381
2382   return DAG.getBuildVector(VT, DL, Args);
2383 }
2384
2385 //===----------------------------------------------------------------------===//
2386 // Custom DAG optimizations
2387 //===----------------------------------------------------------------------===//
2388
2389 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2390   KnownBits Known;
2391   EVT VT = Op.getValueType();
2392   DAG.computeKnownBits(Op, Known);
2393
2394   return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
2395 }
2396
2397 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2398   EVT VT = Op.getValueType();
2399
2400   // In order for this to be a signed 24-bit value, bit 23, must
2401   // be a sign bit.
2402   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2403                                      // as unsigned 24-bit values.
2404          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2405 }
2406
2407 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2408                         TargetLowering::DAGCombinerInfo &DCI) {
2409
2410   SelectionDAG &DAG = DCI.DAG;
2411   SDValue Op = Node24->getOperand(OpIdx);
2412   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2413   EVT VT = Op.getValueType();
2414
2415   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2416   APInt KnownZero, KnownOne;
2417   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2418   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
2419     return true;
2420
2421   return false;
2422 }
2423
2424 template <typename IntTy>
2425 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2426                                uint32_t Width, const SDLoc &DL) {
2427   if (Width + Offset < 32) {
2428     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2429     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2430     return DAG.getConstant(Result, DL, MVT::i32);
2431   }
2432
2433   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2434 }
2435
2436 static bool hasVolatileUser(SDNode *Val) {
2437   for (SDNode *U : Val->uses()) {
2438     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2439       if (M->isVolatile())
2440         return true;
2441     }
2442   }
2443
2444   return false;
2445 }
2446
2447 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2448   // i32 vectors are the canonical memory type.
2449   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2450     return false;
2451
2452   if (!VT.isByteSized())
2453     return false;
2454
2455   unsigned Size = VT.getStoreSize();
2456
2457   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2458     return false;
2459
2460   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2461     return false;
2462
2463   return true;
2464 }
2465
2466 // Replace load of an illegal type with a store of a bitcast to a friendlier
2467 // type.
2468 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2469                                                  DAGCombinerInfo &DCI) const {
2470   if (!DCI.isBeforeLegalize())
2471     return SDValue();
2472
2473   LoadSDNode *LN = cast<LoadSDNode>(N);
2474   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2475     return SDValue();
2476
2477   SDLoc SL(N);
2478   SelectionDAG &DAG = DCI.DAG;
2479   EVT VT = LN->getMemoryVT();
2480
2481   unsigned Size = VT.getStoreSize();
2482   unsigned Align = LN->getAlignment();
2483   if (Align < Size && isTypeLegal(VT)) {
2484     bool IsFast;
2485     unsigned AS = LN->getAddressSpace();
2486
2487     // Expand unaligned loads earlier than legalization. Due to visitation order
2488     // problems during legalization, the emitted instructions to pack and unpack
2489     // the bytes again are not eliminated in the case of an unaligned copy.
2490     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2491       if (VT.isVector())
2492         return scalarizeVectorLoad(LN, DAG);
2493
2494       SDValue Ops[2];
2495       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2496       return DAG.getMergeValues(Ops, SDLoc(N));
2497     }
2498
2499     if (!IsFast)
2500       return SDValue();
2501   }
2502
2503   if (!shouldCombineMemoryType(VT))
2504     return SDValue();
2505
2506   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2507
2508   SDValue NewLoad
2509     = DAG.getLoad(NewVT, SL, LN->getChain(),
2510                   LN->getBasePtr(), LN->getMemOperand());
2511
2512   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2513   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2514   return SDValue(N, 0);
2515 }
2516
2517 // Replace store of an illegal type with a store of a bitcast to a friendlier
2518 // type.
2519 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2520                                                   DAGCombinerInfo &DCI) const {
2521   if (!DCI.isBeforeLegalize())
2522     return SDValue();
2523
2524   StoreSDNode *SN = cast<StoreSDNode>(N);
2525   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2526     return SDValue();
2527
2528   EVT VT = SN->getMemoryVT();
2529   unsigned Size = VT.getStoreSize();
2530
2531   SDLoc SL(N);
2532   SelectionDAG &DAG = DCI.DAG;
2533   unsigned Align = SN->getAlignment();
2534   if (Align < Size && isTypeLegal(VT)) {
2535     bool IsFast;
2536     unsigned AS = SN->getAddressSpace();
2537
2538     // Expand unaligned stores earlier than legalization. Due to visitation
2539     // order problems during legalization, the emitted instructions to pack and
2540     // unpack the bytes again are not eliminated in the case of an unaligned
2541     // copy.
2542     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2543       if (VT.isVector())
2544         return scalarizeVectorStore(SN, DAG);
2545
2546       return expandUnalignedStore(SN, DAG);
2547     }
2548
2549     if (!IsFast)
2550       return SDValue();
2551   }
2552
2553   if (!shouldCombineMemoryType(VT))
2554     return SDValue();
2555
2556   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2557   SDValue Val = SN->getValue();
2558
2559   //DCI.AddToWorklist(Val.getNode());
2560
2561   bool OtherUses = !Val.hasOneUse();
2562   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2563   if (OtherUses) {
2564     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2565     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2566   }
2567
2568   return DAG.getStore(SN->getChain(), SL, CastVal,
2569                       SN->getBasePtr(), SN->getMemOperand());
2570 }
2571
2572 SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2573                                                   DAGCombinerInfo &DCI) const {
2574   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2575   if (!CSrc)
2576     return SDValue();
2577
2578   const APFloat &F = CSrc->getValueAPF();
2579   APFloat Zero = APFloat::getZero(F.getSemantics());
2580   APFloat::cmpResult Cmp0 = F.compare(Zero);
2581   if (Cmp0 == APFloat::cmpLessThan ||
2582       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2583     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2584   }
2585
2586   APFloat One(F.getSemantics(), "1.0");
2587   APFloat::cmpResult Cmp1 = F.compare(One);
2588   if (Cmp1 == APFloat::cmpGreaterThan)
2589     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2590
2591   return SDValue(CSrc, 0);
2592 }
2593
2594 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2595 /// binary operation \p Opc to it with the corresponding constant operands.
2596 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2597   DAGCombinerInfo &DCI, const SDLoc &SL,
2598   unsigned Opc, SDValue LHS,
2599   uint32_t ValLo, uint32_t ValHi) const {
2600   SelectionDAG &DAG = DCI.DAG;
2601   SDValue Lo, Hi;
2602   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2603
2604   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2605   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2606
2607   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2608   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2609
2610   // Re-visit the ands. It's possible we eliminated one of them and it could
2611   // simplify the vector.
2612   DCI.AddToWorklist(Lo.getNode());
2613   DCI.AddToWorklist(Hi.getNode());
2614
2615   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2616   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2617 }
2618
2619 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2620                                                 DAGCombinerInfo &DCI) const {
2621   EVT VT = N->getValueType(0);
2622
2623   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2624   if (!RHS)
2625     return SDValue();
2626
2627   SDValue LHS = N->getOperand(0);
2628   unsigned RHSVal = RHS->getZExtValue();
2629   if (!RHSVal)
2630     return LHS;
2631
2632   SDLoc SL(N);
2633   SelectionDAG &DAG = DCI.DAG;
2634
2635   switch (LHS->getOpcode()) {
2636   default:
2637     break;
2638   case ISD::ZERO_EXTEND:
2639   case ISD::SIGN_EXTEND:
2640   case ISD::ANY_EXTEND: {
2641     // shl (ext x) => zext (shl x), if shift does not overflow int
2642     if (VT != MVT::i64)
2643       break;
2644     KnownBits Known;
2645     SDValue X = LHS->getOperand(0);
2646     DAG.computeKnownBits(X, Known);
2647     unsigned LZ = Known.countMinLeadingZeros();
2648     if (LZ < RHSVal)
2649       break;
2650     EVT XVT = X.getValueType();
2651     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2652     return DAG.getZExtOrTrunc(Shl, SL, VT);
2653   }
2654   case ISD::OR:  if (!isOrEquivalentToAdd(DAG, LHS)) break;
2655   case ISD::ADD: { // Fall through from above
2656     // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
2657     if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
2658       SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
2659                                 SDValue(RHS, 0));
2660       SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
2661                                     SDLoc(C2), VT);
2662       return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
2663     }
2664     break;
2665   }
2666   }
2667
2668   if (VT != MVT::i64)
2669     return SDValue();
2670
2671   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2672
2673   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2674   // common case, splitting this into a move and a 32-bit shift is faster and
2675   // the same code size.
2676   if (RHSVal < 32)
2677     return SDValue();
2678
2679   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2680
2681   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2682   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2683
2684   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2685
2686   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2687   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2688 }
2689
2690 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2691                                                 DAGCombinerInfo &DCI) const {
2692   if (N->getValueType(0) != MVT::i64)
2693     return SDValue();
2694
2695   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2696   if (!RHS)
2697     return SDValue();
2698
2699   SelectionDAG &DAG = DCI.DAG;
2700   SDLoc SL(N);
2701   unsigned RHSVal = RHS->getZExtValue();
2702
2703   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2704   if (RHSVal == 32) {
2705     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2706     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2707                                    DAG.getConstant(31, SL, MVT::i32));
2708
2709     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2710     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2711   }
2712
2713   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2714   if (RHSVal == 63) {
2715     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2716     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2717                                    DAG.getConstant(31, SL, MVT::i32));
2718     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2719     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2720   }
2721
2722   return SDValue();
2723 }
2724
2725 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2726                                                 DAGCombinerInfo &DCI) const {
2727   if (N->getValueType(0) != MVT::i64)
2728     return SDValue();
2729
2730   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2731   if (!RHS)
2732     return SDValue();
2733
2734   unsigned ShiftAmt = RHS->getZExtValue();
2735   if (ShiftAmt < 32)
2736     return SDValue();
2737
2738   // srl i64:x, C for C >= 32
2739   // =>
2740   //   build_pair (srl hi_32(x), C - 32), 0
2741
2742   SelectionDAG &DAG = DCI.DAG;
2743   SDLoc SL(N);
2744
2745   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2746   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2747
2748   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2749   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2750                            VecOp, One);
2751
2752   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2753   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2754
2755   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2756
2757   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2758 }
2759
2760 // We need to specifically handle i64 mul here to avoid unnecessary conversion
2761 // instructions. If we only match on the legalized i64 mul expansion,
2762 // SimplifyDemandedBits will be unable to remove them because there will be
2763 // multiple uses due to the separate mul + mulh[su].
2764 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2765                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2766   if (Size <= 32) {
2767     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2768     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2769   }
2770
2771   // Because we want to eliminate extension instructions before the
2772   // operation, we need to create a single user here (i.e. not the separate
2773   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2774
2775   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2776
2777   SDValue Mul = DAG.getNode(MulOpc, SL,
2778                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2779
2780   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2781                      Mul.getValue(0), Mul.getValue(1));
2782 }
2783
2784 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2785                                                 DAGCombinerInfo &DCI) const {
2786   EVT VT = N->getValueType(0);
2787
2788   unsigned Size = VT.getSizeInBits();
2789   if (VT.isVector() || Size > 64)
2790     return SDValue();
2791
2792   // There are i16 integer mul/mad.
2793   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2794     return SDValue();
2795
2796   SelectionDAG &DAG = DCI.DAG;
2797   SDLoc DL(N);
2798
2799   SDValue N0 = N->getOperand(0);
2800   SDValue N1 = N->getOperand(1);
2801   SDValue Mul;
2802
2803   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2804     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2805     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2806     Mul = getMul24(DAG, DL, N0, N1, Size, false);
2807   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2808     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2809     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2810     Mul = getMul24(DAG, DL, N0, N1, Size, true);
2811   } else {
2812     return SDValue();
2813   }
2814
2815   // We need to use sext even for MUL_U24, because MUL_U24 is used
2816   // for signed multiply of 8 and 16-bit types.
2817   return DAG.getSExtOrTrunc(Mul, DL, VT);
2818 }
2819
2820 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2821                                                   DAGCombinerInfo &DCI) const {
2822   EVT VT = N->getValueType(0);
2823
2824   if (!Subtarget->hasMulI24() || VT.isVector())
2825     return SDValue();
2826
2827   SelectionDAG &DAG = DCI.DAG;
2828   SDLoc DL(N);
2829
2830   SDValue N0 = N->getOperand(0);
2831   SDValue N1 = N->getOperand(1);
2832
2833   if (!isI24(N0, DAG) || !isI24(N1, DAG))
2834     return SDValue();
2835
2836   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2837   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2838
2839   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2840   DCI.AddToWorklist(Mulhi.getNode());
2841   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2842 }
2843
2844 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2845                                                   DAGCombinerInfo &DCI) const {
2846   EVT VT = N->getValueType(0);
2847
2848   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2849     return SDValue();
2850
2851   SelectionDAG &DAG = DCI.DAG;
2852   SDLoc DL(N);
2853
2854   SDValue N0 = N->getOperand(0);
2855   SDValue N1 = N->getOperand(1);
2856
2857   if (!isU24(N0, DAG) || !isU24(N1, DAG))
2858     return SDValue();
2859
2860   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2861   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2862
2863   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2864   DCI.AddToWorklist(Mulhi.getNode());
2865   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2866 }
2867
2868 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2869   SDNode *N, DAGCombinerInfo &DCI) const {
2870   SelectionDAG &DAG = DCI.DAG;
2871
2872   // Simplify demanded bits before splitting into multiple users.
2873   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2874     return SDValue();
2875
2876   SDValue N0 = N->getOperand(0);
2877   SDValue N1 = N->getOperand(1);
2878
2879   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2880
2881   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2882   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2883
2884   SDLoc SL(N);
2885
2886   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2887   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2888   return DAG.getMergeValues({ MulLo, MulHi }, SL);
2889 }
2890
2891 static bool isNegativeOne(SDValue Val) {
2892   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2893     return C->isAllOnesValue();
2894   return false;
2895 }
2896
2897 static bool isCtlzOpc(unsigned Opc) {
2898   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2899 }
2900
2901 SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2902                                           SDValue Op,
2903                                           const SDLoc &DL) const {
2904   EVT VT = Op.getValueType();
2905   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2906   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2907                               LegalVT != MVT::i16))
2908     return SDValue();
2909
2910   if (VT != MVT::i32)
2911     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2912
2913   SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2914   if (VT != MVT::i32)
2915     FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
2916
2917   return FFBH;
2918 }
2919
2920 // The native instructions return -1 on 0 input. Optimize out a select that
2921 // produces -1 on 0.
2922 //
2923 // TODO: If zero is not undef, we could also do this if the output is compared
2924 // against the bitwidth.
2925 //
2926 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2927 SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2928                                                  SDValue LHS, SDValue RHS,
2929                                                  DAGCombinerInfo &DCI) const {
2930   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2931   if (!CmpRhs || !CmpRhs->isNullValue())
2932     return SDValue();
2933
2934   SelectionDAG &DAG = DCI.DAG;
2935   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2936   SDValue CmpLHS = Cond.getOperand(0);
2937
2938   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2939   if (CCOpcode == ISD::SETEQ &&
2940       isCtlzOpc(RHS.getOpcode()) &&
2941       RHS.getOperand(0) == CmpLHS &&
2942       isNegativeOne(LHS)) {
2943     return getFFBH_U32(DAG, CmpLHS, SL);
2944   }
2945
2946   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2947   if (CCOpcode == ISD::SETNE &&
2948       isCtlzOpc(LHS.getOpcode()) &&
2949       LHS.getOperand(0) == CmpLHS &&
2950       isNegativeOne(RHS)) {
2951     return getFFBH_U32(DAG, CmpLHS, SL);
2952   }
2953
2954   return SDValue();
2955 }
2956
2957 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
2958                                          unsigned Op,
2959                                          const SDLoc &SL,
2960                                          SDValue Cond,
2961                                          SDValue N1,
2962                                          SDValue N2) {
2963   SelectionDAG &DAG = DCI.DAG;
2964   EVT VT = N1.getValueType();
2965
2966   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
2967                                   N1.getOperand(0), N2.getOperand(0));
2968   DCI.AddToWorklist(NewSelect.getNode());
2969   return DAG.getNode(Op, SL, VT, NewSelect);
2970 }
2971
2972 // Pull a free FP operation out of a select so it may fold into uses.
2973 //
2974 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
2975 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
2976 //
2977 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
2978 // select c, (fabs x), +k -> fabs (select c, x, k)
2979 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
2980                                     SDValue N) {
2981   SelectionDAG &DAG = DCI.DAG;
2982   SDValue Cond = N.getOperand(0);
2983   SDValue LHS = N.getOperand(1);
2984   SDValue RHS = N.getOperand(2);
2985
2986   EVT VT = N.getValueType();
2987   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
2988       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
2989     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
2990                                      SDLoc(N), Cond, LHS, RHS);
2991   }
2992
2993   bool Inv = false;
2994   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
2995     std::swap(LHS, RHS);
2996     Inv = true;
2997   }
2998
2999   // TODO: Support vector constants.
3000   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3001   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3002     SDLoc SL(N);
3003     // If one side is an fneg/fabs and the other is a constant, we can push the
3004     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3005     SDValue NewLHS = LHS.getOperand(0);
3006     SDValue NewRHS = RHS;
3007
3008     // Careful: if the neg can be folded up, don't try to pull it back down.
3009     bool ShouldFoldNeg = true;
3010
3011     if (NewLHS.hasOneUse()) {
3012       unsigned Opc = NewLHS.getOpcode();
3013       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3014         ShouldFoldNeg = false;
3015       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3016         ShouldFoldNeg = false;
3017     }
3018
3019     if (ShouldFoldNeg) {
3020       if (LHS.getOpcode() == ISD::FNEG)
3021         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3022       else if (CRHS->isNegative())
3023         return SDValue();
3024
3025       if (Inv)
3026         std::swap(NewLHS, NewRHS);
3027
3028       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3029                                       Cond, NewLHS, NewRHS);
3030       DCI.AddToWorklist(NewSelect.getNode());
3031       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3032     }
3033   }
3034
3035   return SDValue();
3036 }
3037
3038
3039 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3040                                                    DAGCombinerInfo &DCI) const {
3041   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3042     return Folded;
3043
3044   SDValue Cond = N->getOperand(0);
3045   if (Cond.getOpcode() != ISD::SETCC)
3046     return SDValue();
3047
3048   EVT VT = N->getValueType(0);
3049   SDValue LHS = Cond.getOperand(0);
3050   SDValue RHS = Cond.getOperand(1);
3051   SDValue CC = Cond.getOperand(2);
3052
3053   SDValue True = N->getOperand(1);
3054   SDValue False = N->getOperand(2);
3055
3056   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3057     SelectionDAG &DAG = DCI.DAG;
3058     if ((DAG.isConstantValueOfAnyType(True) ||
3059          DAG.isConstantValueOfAnyType(True)) &&
3060         (!DAG.isConstantValueOfAnyType(False) &&
3061          !DAG.isConstantValueOfAnyType(False))) {
3062       // Swap cmp + select pair to move constant to false input.
3063       // This will allow using VOPC cndmasks more often.
3064       // select (setcc x, y), k, x -> select (setcc y, x) x, x
3065
3066       SDLoc SL(N);
3067       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3068                                             LHS.getValueType().isInteger());
3069
3070       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3071       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3072     }
3073
3074     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3075       SDValue MinMax
3076         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3077       // Revisit this node so we can catch min3/max3/med3 patterns.
3078       //DCI.AddToWorklist(MinMax.getNode());
3079       return MinMax;
3080     }
3081   }
3082
3083   // There's no reason to not do this if the condition has other uses.
3084   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
3085 }
3086
3087 static bool isConstantFPZero(SDValue N) {
3088   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3089     return C->isZero() && !C->isNegative();
3090   return false;
3091 }
3092
3093 static unsigned inverseMinMax(unsigned Opc) {
3094   switch (Opc) {
3095   case ISD::FMAXNUM:
3096     return ISD::FMINNUM;
3097   case ISD::FMINNUM:
3098     return ISD::FMAXNUM;
3099   case AMDGPUISD::FMAX_LEGACY:
3100     return AMDGPUISD::FMIN_LEGACY;
3101   case AMDGPUISD::FMIN_LEGACY:
3102     return  AMDGPUISD::FMAX_LEGACY;
3103   default:
3104     llvm_unreachable("invalid min/max opcode");
3105   }
3106 }
3107
3108 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3109                                                  DAGCombinerInfo &DCI) const {
3110   SelectionDAG &DAG = DCI.DAG;
3111   SDValue N0 = N->getOperand(0);
3112   EVT VT = N->getValueType(0);
3113
3114   unsigned Opc = N0.getOpcode();
3115
3116   // If the input has multiple uses and we can either fold the negate down, or
3117   // the other uses cannot, give up. This both prevents unprofitable
3118   // transformations and infinite loops: we won't repeatedly try to fold around
3119   // a negate that has no 'good' form.
3120   if (N0.hasOneUse()) {
3121     // This may be able to fold into the source, but at a code size cost. Don't
3122     // fold if the fold into the user is free.
3123     if (allUsesHaveSourceMods(N, 0))
3124       return SDValue();
3125   } else {
3126     if (fnegFoldsIntoOp(Opc) &&
3127         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3128       return SDValue();
3129   }
3130
3131   SDLoc SL(N);
3132   switch (Opc) {
3133   case ISD::FADD: {
3134     if (!mayIgnoreSignedZero(N0))
3135       return SDValue();
3136
3137     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3138     SDValue LHS = N0.getOperand(0);
3139     SDValue RHS = N0.getOperand(1);
3140
3141     if (LHS.getOpcode() != ISD::FNEG)
3142       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3143     else
3144       LHS = LHS.getOperand(0);
3145
3146     if (RHS.getOpcode() != ISD::FNEG)
3147       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3148     else
3149       RHS = RHS.getOperand(0);
3150
3151     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3152     if (!N0.hasOneUse())
3153       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3154     return Res;
3155   }
3156   case ISD::FMUL:
3157   case AMDGPUISD::FMUL_LEGACY: {
3158     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3159     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3160     SDValue LHS = N0.getOperand(0);
3161     SDValue RHS = N0.getOperand(1);
3162
3163     if (LHS.getOpcode() == ISD::FNEG)
3164       LHS = LHS.getOperand(0);
3165     else if (RHS.getOpcode() == ISD::FNEG)
3166       RHS = RHS.getOperand(0);
3167     else
3168       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3169
3170     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3171     if (!N0.hasOneUse())
3172       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3173     return Res;
3174   }
3175   case ISD::FMA:
3176   case ISD::FMAD: {
3177     if (!mayIgnoreSignedZero(N0))
3178       return SDValue();
3179
3180     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3181     SDValue LHS = N0.getOperand(0);
3182     SDValue MHS = N0.getOperand(1);
3183     SDValue RHS = N0.getOperand(2);
3184
3185     if (LHS.getOpcode() == ISD::FNEG)
3186       LHS = LHS.getOperand(0);
3187     else if (MHS.getOpcode() == ISD::FNEG)
3188       MHS = MHS.getOperand(0);
3189     else
3190       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3191
3192     if (RHS.getOpcode() != ISD::FNEG)
3193       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3194     else
3195       RHS = RHS.getOperand(0);
3196
3197     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3198     if (!N0.hasOneUse())
3199       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3200     return Res;
3201   }
3202   case ISD::FMAXNUM:
3203   case ISD::FMINNUM:
3204   case AMDGPUISD::FMAX_LEGACY:
3205   case AMDGPUISD::FMIN_LEGACY: {
3206     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3207     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3208     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3209     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3210
3211     SDValue LHS = N0.getOperand(0);
3212     SDValue RHS = N0.getOperand(1);
3213
3214     // 0 doesn't have a negated inline immediate.
3215     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3216     // operations.
3217     if (isConstantFPZero(RHS))
3218       return SDValue();
3219
3220     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3221     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3222     unsigned Opposite = inverseMinMax(Opc);
3223
3224     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3225     if (!N0.hasOneUse())
3226       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3227     return Res;
3228   }
3229   case ISD::FP_EXTEND:
3230   case ISD::FTRUNC:
3231   case ISD::FRINT:
3232   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3233   case ISD::FSIN:
3234   case AMDGPUISD::RCP:
3235   case AMDGPUISD::RCP_LEGACY:
3236   case AMDGPUISD::SIN_HW: {
3237     SDValue CvtSrc = N0.getOperand(0);
3238     if (CvtSrc.getOpcode() == ISD::FNEG) {
3239       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3240       // (fneg (rcp (fneg x))) -> (rcp x)
3241       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3242     }
3243
3244     if (!N0.hasOneUse())
3245       return SDValue();
3246
3247     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3248     // (fneg (rcp x)) -> (rcp (fneg x))
3249     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3250     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3251   }
3252   case ISD::FP_ROUND: {
3253     SDValue CvtSrc = N0.getOperand(0);
3254
3255     if (CvtSrc.getOpcode() == ISD::FNEG) {
3256       // (fneg (fp_round (fneg x))) -> (fp_round x)
3257       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3258                          CvtSrc.getOperand(0), N0.getOperand(1));
3259     }
3260
3261     if (!N0.hasOneUse())
3262       return SDValue();
3263
3264     // (fneg (fp_round x)) -> (fp_round (fneg x))
3265     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3266     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3267   }
3268   case ISD::FP16_TO_FP: {
3269     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3270     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3271     // Put the fneg back as a legal source operation that can be matched later.
3272     SDLoc SL(N);
3273
3274     SDValue Src = N0.getOperand(0);
3275     EVT SrcVT = Src.getValueType();
3276
3277     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3278     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3279                                   DAG.getConstant(0x8000, SL, SrcVT));
3280     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3281   }
3282   default:
3283     return SDValue();
3284   }
3285 }
3286
3287 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3288                                                  DAGCombinerInfo &DCI) const {
3289   SelectionDAG &DAG = DCI.DAG;
3290   SDValue N0 = N->getOperand(0);
3291
3292   if (!N0.hasOneUse())
3293     return SDValue();
3294
3295   switch (N0.getOpcode()) {
3296   case ISD::FP16_TO_FP: {
3297     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3298     SDLoc SL(N);
3299     SDValue Src = N0.getOperand(0);
3300     EVT SrcVT = Src.getValueType();
3301
3302     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3303     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3304                                   DAG.getConstant(0x7fff, SL, SrcVT));
3305     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3306   }
3307   default:
3308     return SDValue();
3309   }
3310 }
3311
3312 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3313                                                 DAGCombinerInfo &DCI) const {
3314   SelectionDAG &DAG = DCI.DAG;
3315   SDLoc DL(N);
3316
3317   switch(N->getOpcode()) {
3318   default:
3319     break;
3320   case ISD::BITCAST: {
3321     EVT DestVT = N->getValueType(0);
3322
3323     // Push casts through vector builds. This helps avoid emitting a large
3324     // number of copies when materializing floating point vector constants.
3325     //
3326     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3327     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3328     if (DestVT.isVector()) {
3329       SDValue Src = N->getOperand(0);
3330       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3331         EVT SrcVT = Src.getValueType();
3332         unsigned NElts = DestVT.getVectorNumElements();
3333
3334         if (SrcVT.getVectorNumElements() == NElts) {
3335           EVT DestEltVT = DestVT.getVectorElementType();
3336
3337           SmallVector<SDValue, 8> CastedElts;
3338           SDLoc SL(N);
3339           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3340             SDValue Elt = Src.getOperand(I);
3341             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3342           }
3343
3344           return DAG.getBuildVector(DestVT, SL, CastedElts);
3345         }
3346       }
3347     }
3348
3349     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3350       break;
3351
3352     // Fold bitcasts of constants.
3353     //
3354     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3355     // TODO: Generalize and move to DAGCombiner
3356     SDValue Src = N->getOperand(0);
3357     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3358       assert(Src.getValueType() == MVT::i64);
3359       SDLoc SL(N);
3360       uint64_t CVal = C->getZExtValue();
3361       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3362                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3363                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3364     }
3365
3366     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3367       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3368       SDLoc SL(N);
3369       uint64_t CVal = Val.getZExtValue();
3370       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3371                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3372                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3373
3374       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3375     }
3376
3377     break;
3378   }
3379   case ISD::SHL: {
3380     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3381       break;
3382
3383     return performShlCombine(N, DCI);
3384   }
3385   case ISD::SRL: {
3386     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3387       break;
3388
3389     return performSrlCombine(N, DCI);
3390   }
3391   case ISD::SRA: {
3392     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3393       break;
3394
3395     return performSraCombine(N, DCI);
3396   }
3397   case ISD::MUL:
3398     return performMulCombine(N, DCI);
3399   case ISD::MULHS:
3400     return performMulhsCombine(N, DCI);
3401   case ISD::MULHU:
3402     return performMulhuCombine(N, DCI);
3403   case AMDGPUISD::MUL_I24:
3404   case AMDGPUISD::MUL_U24:
3405   case AMDGPUISD::MULHI_I24:
3406   case AMDGPUISD::MULHI_U24: {
3407     // If the first call to simplify is successfull, then N may end up being
3408     // deleted, so we shouldn't call simplifyI24 again.
3409     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3410     return SDValue();
3411   }
3412   case AMDGPUISD::MUL_LOHI_I24:
3413   case AMDGPUISD::MUL_LOHI_U24:
3414     return performMulLoHi24Combine(N, DCI);
3415   case ISD::SELECT:
3416     return performSelectCombine(N, DCI);
3417   case ISD::FNEG:
3418     return performFNegCombine(N, DCI);
3419   case ISD::FABS:
3420     return performFAbsCombine(N, DCI);
3421   case AMDGPUISD::BFE_I32:
3422   case AMDGPUISD::BFE_U32: {
3423     assert(!N->getValueType(0).isVector() &&
3424            "Vector handling of BFE not implemented");
3425     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3426     if (!Width)
3427       break;
3428
3429     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3430     if (WidthVal == 0)
3431       return DAG.getConstant(0, DL, MVT::i32);
3432
3433     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3434     if (!Offset)
3435       break;
3436
3437     SDValue BitsFrom = N->getOperand(0);
3438     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3439
3440     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3441
3442     if (OffsetVal == 0) {
3443       // This is already sign / zero extended, so try to fold away extra BFEs.
3444       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3445
3446       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3447       if (OpSignBits >= SignBits)
3448         return BitsFrom;
3449
3450       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3451       if (Signed) {
3452         // This is a sign_extend_inreg. Replace it to take advantage of existing
3453         // DAG Combines. If not eliminated, we will match back to BFE during
3454         // selection.
3455
3456         // TODO: The sext_inreg of extended types ends, although we can could
3457         // handle them in a single BFE.
3458         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3459                            DAG.getValueType(SmallVT));
3460       }
3461
3462       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3463     }
3464
3465     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3466       if (Signed) {
3467         return constantFoldBFE<int32_t>(DAG,
3468                                         CVal->getSExtValue(),
3469                                         OffsetVal,
3470                                         WidthVal,
3471                                         DL);
3472       }
3473
3474       return constantFoldBFE<uint32_t>(DAG,
3475                                        CVal->getZExtValue(),
3476                                        OffsetVal,
3477                                        WidthVal,
3478                                        DL);
3479     }
3480
3481     if ((OffsetVal + WidthVal) >= 32 &&
3482         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
3483       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3484       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3485                          BitsFrom, ShiftVal);
3486     }
3487
3488     if (BitsFrom.hasOneUse()) {
3489       APInt Demanded = APInt::getBitsSet(32,
3490                                          OffsetVal,
3491                                          OffsetVal + WidthVal);
3492
3493       KnownBits Known;
3494       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3495                                             !DCI.isBeforeLegalizeOps());
3496       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3497       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
3498           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
3499         DCI.CommitTargetLoweringOpt(TLO);
3500       }
3501     }
3502
3503     break;
3504   }
3505   case ISD::LOAD:
3506     return performLoadCombine(N, DCI);
3507   case ISD::STORE:
3508     return performStoreCombine(N, DCI);
3509   case AMDGPUISD::CLAMP:
3510     return performClampCombine(N, DCI);
3511   case AMDGPUISD::RCP: {
3512     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3513       // XXX - Should this flush denormals?
3514       const APFloat &Val = CFP->getValueAPF();
3515       APFloat One(Val.getSemantics(), "1.0");
3516       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3517     }
3518
3519     break;
3520   }
3521   }
3522   return SDValue();
3523 }
3524
3525 //===----------------------------------------------------------------------===//
3526 // Helper functions
3527 //===----------------------------------------------------------------------===//
3528
3529 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3530                                                    const TargetRegisterClass *RC,
3531                                                    unsigned Reg, EVT VT,
3532                                                    const SDLoc &SL,
3533                                                    bool RawReg) const {
3534   MachineFunction &MF = DAG.getMachineFunction();
3535   MachineRegisterInfo &MRI = MF.getRegInfo();
3536   unsigned VReg;
3537
3538   if (!MRI.isLiveIn(Reg)) {
3539     VReg = MRI.createVirtualRegister(RC);
3540     MRI.addLiveIn(Reg, VReg);
3541   } else {
3542     VReg = MRI.getLiveInVirtReg(Reg);
3543   }
3544
3545   if (RawReg)
3546     return DAG.getRegister(VReg, VT);
3547
3548   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
3549 }
3550
3551 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3552     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3553   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3554   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3555   switch (Param) {
3556   case GRID_DIM:
3557     return ArgOffset;
3558   case GRID_OFFSET:
3559     return ArgOffset + 4;
3560   }
3561   llvm_unreachable("unexpected implicit parameter type");
3562 }
3563
3564 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3565
3566 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3567   switch ((AMDGPUISD::NodeType)Opcode) {
3568   case AMDGPUISD::FIRST_NUMBER: break;
3569   // AMDIL DAG nodes
3570   NODE_NAME_CASE(UMUL);
3571   NODE_NAME_CASE(BRANCH_COND);
3572
3573   // AMDGPU DAG nodes
3574   NODE_NAME_CASE(IF)
3575   NODE_NAME_CASE(ELSE)
3576   NODE_NAME_CASE(LOOP)
3577   NODE_NAME_CASE(CALL)
3578   NODE_NAME_CASE(TRAP)
3579   NODE_NAME_CASE(RET_FLAG)
3580   NODE_NAME_CASE(RETURN_TO_EPILOG)
3581   NODE_NAME_CASE(ENDPGM)
3582   NODE_NAME_CASE(DWORDADDR)
3583   NODE_NAME_CASE(FRACT)
3584   NODE_NAME_CASE(SETCC)
3585   NODE_NAME_CASE(SETREG)
3586   NODE_NAME_CASE(FMA_W_CHAIN)
3587   NODE_NAME_CASE(FMUL_W_CHAIN)
3588   NODE_NAME_CASE(CLAMP)
3589   NODE_NAME_CASE(COS_HW)
3590   NODE_NAME_CASE(SIN_HW)
3591   NODE_NAME_CASE(FMAX_LEGACY)
3592   NODE_NAME_CASE(FMIN_LEGACY)
3593   NODE_NAME_CASE(FMAX3)
3594   NODE_NAME_CASE(SMAX3)
3595   NODE_NAME_CASE(UMAX3)
3596   NODE_NAME_CASE(FMIN3)
3597   NODE_NAME_CASE(SMIN3)
3598   NODE_NAME_CASE(UMIN3)
3599   NODE_NAME_CASE(FMED3)
3600   NODE_NAME_CASE(SMED3)
3601   NODE_NAME_CASE(UMED3)
3602   NODE_NAME_CASE(URECIP)
3603   NODE_NAME_CASE(DIV_SCALE)
3604   NODE_NAME_CASE(DIV_FMAS)
3605   NODE_NAME_CASE(DIV_FIXUP)
3606   NODE_NAME_CASE(FMAD_FTZ)
3607   NODE_NAME_CASE(TRIG_PREOP)
3608   NODE_NAME_CASE(RCP)
3609   NODE_NAME_CASE(RSQ)
3610   NODE_NAME_CASE(RCP_LEGACY)
3611   NODE_NAME_CASE(RSQ_LEGACY)
3612   NODE_NAME_CASE(FMUL_LEGACY)
3613   NODE_NAME_CASE(RSQ_CLAMP)
3614   NODE_NAME_CASE(LDEXP)
3615   NODE_NAME_CASE(FP_CLASS)
3616   NODE_NAME_CASE(DOT4)
3617   NODE_NAME_CASE(CARRY)
3618   NODE_NAME_CASE(BORROW)
3619   NODE_NAME_CASE(BFE_U32)
3620   NODE_NAME_CASE(BFE_I32)
3621   NODE_NAME_CASE(BFI)
3622   NODE_NAME_CASE(BFM)
3623   NODE_NAME_CASE(FFBH_U32)
3624   NODE_NAME_CASE(FFBH_I32)
3625   NODE_NAME_CASE(MUL_U24)
3626   NODE_NAME_CASE(MUL_I24)
3627   NODE_NAME_CASE(MULHI_U24)
3628   NODE_NAME_CASE(MULHI_I24)
3629   NODE_NAME_CASE(MUL_LOHI_U24)
3630   NODE_NAME_CASE(MUL_LOHI_I24)
3631   NODE_NAME_CASE(MAD_U24)
3632   NODE_NAME_CASE(MAD_I24)
3633   NODE_NAME_CASE(TEXTURE_FETCH)
3634   NODE_NAME_CASE(EXPORT)
3635   NODE_NAME_CASE(EXPORT_DONE)
3636   NODE_NAME_CASE(R600_EXPORT)
3637   NODE_NAME_CASE(CONST_ADDRESS)
3638   NODE_NAME_CASE(REGISTER_LOAD)
3639   NODE_NAME_CASE(REGISTER_STORE)
3640   NODE_NAME_CASE(SAMPLE)
3641   NODE_NAME_CASE(SAMPLEB)
3642   NODE_NAME_CASE(SAMPLED)
3643   NODE_NAME_CASE(SAMPLEL)
3644   NODE_NAME_CASE(CVT_F32_UBYTE0)
3645   NODE_NAME_CASE(CVT_F32_UBYTE1)
3646   NODE_NAME_CASE(CVT_F32_UBYTE2)
3647   NODE_NAME_CASE(CVT_F32_UBYTE3)
3648   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3649   NODE_NAME_CASE(FP_TO_FP16)
3650   NODE_NAME_CASE(FP16_ZEXT)
3651   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3652   NODE_NAME_CASE(CONST_DATA_PTR)
3653   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3654   NODE_NAME_CASE(KILL)
3655   NODE_NAME_CASE(DUMMY_CHAIN)
3656   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3657   NODE_NAME_CASE(INIT_EXEC)
3658   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
3659   NODE_NAME_CASE(SENDMSG)
3660   NODE_NAME_CASE(SENDMSGHALT)
3661   NODE_NAME_CASE(INTERP_MOV)
3662   NODE_NAME_CASE(INTERP_P1)
3663   NODE_NAME_CASE(INTERP_P2)
3664   NODE_NAME_CASE(STORE_MSKOR)
3665   NODE_NAME_CASE(LOAD_CONSTANT)
3666   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3667   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
3668   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
3669   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3670   NODE_NAME_CASE(ATOMIC_INC)
3671   NODE_NAME_CASE(ATOMIC_DEC)
3672   NODE_NAME_CASE(BUFFER_LOAD)
3673   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3674   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3675   }
3676   return nullptr;
3677 }
3678
3679 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3680                                               SelectionDAG &DAG, int Enabled,
3681                                               int &RefinementSteps,
3682                                               bool &UseOneConstNR,
3683                                               bool Reciprocal) const {
3684   EVT VT = Operand.getValueType();
3685
3686   if (VT == MVT::f32) {
3687     RefinementSteps = 0;
3688     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3689   }
3690
3691   // TODO: There is also f64 rsq instruction, but the documentation is less
3692   // clear on its precision.
3693
3694   return SDValue();
3695 }
3696
3697 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3698                                                SelectionDAG &DAG, int Enabled,
3699                                                int &RefinementSteps) const {
3700   EVT VT = Operand.getValueType();
3701
3702   if (VT == MVT::f32) {
3703     // Reciprocal, < 1 ulp error.
3704     //
3705     // This reciprocal approximation converges to < 0.5 ulp error with one
3706     // newton rhapson performed with two fused multiple adds (FMAs).
3707
3708     RefinementSteps = 0;
3709     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3710   }
3711
3712   // TODO: There is also f64 rcp instruction, but the documentation is less
3713   // clear on its precision.
3714
3715   return SDValue();
3716 }
3717
3718 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3719     const SDValue Op, KnownBits &Known,
3720     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
3721
3722   Known.resetAll(); // Don't know anything.
3723
3724   KnownBits Known2;
3725   unsigned Opc = Op.getOpcode();
3726
3727   switch (Opc) {
3728   default:
3729     break;
3730   case AMDGPUISD::CARRY:
3731   case AMDGPUISD::BORROW: {
3732     Known.Zero = APInt::getHighBitsSet(32, 31);
3733     break;
3734   }
3735
3736   case AMDGPUISD::BFE_I32:
3737   case AMDGPUISD::BFE_U32: {
3738     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3739     if (!CWidth)
3740       return;
3741
3742     uint32_t Width = CWidth->getZExtValue() & 0x1f;
3743
3744     if (Opc == AMDGPUISD::BFE_U32)
3745       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
3746
3747     break;
3748   }
3749   case AMDGPUISD::FP_TO_FP16:
3750   case AMDGPUISD::FP16_ZEXT: {
3751     unsigned BitWidth = Known.getBitWidth();
3752
3753     // High bits are zero.
3754     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
3755     break;
3756   }
3757   }
3758 }
3759
3760 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3761     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
3762     unsigned Depth) const {
3763   switch (Op.getOpcode()) {
3764   case AMDGPUISD::BFE_I32: {
3765     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3766     if (!Width)
3767       return 1;
3768
3769     unsigned SignBits = 32 - Width->getZExtValue() + 1;
3770     if (!isNullConstant(Op.getOperand(1)))
3771       return SignBits;
3772
3773     // TODO: Could probably figure something out with non-0 offsets.
3774     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3775     return std::max(SignBits, Op0SignBits);
3776   }
3777
3778   case AMDGPUISD::BFE_U32: {
3779     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3780     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3781   }
3782
3783   case AMDGPUISD::CARRY:
3784   case AMDGPUISD::BORROW:
3785     return 31;
3786   case AMDGPUISD::FP_TO_FP16:
3787   case AMDGPUISD::FP16_ZEXT:
3788     return 16;
3789   default:
3790     return 1;
3791   }
3792 }