contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

   1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief This is the parent TargetLowering class for hardware code gen
  12 /// targets.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPUISelLowering.h"
  17 #include "AMDGPU.h"
  18 #include "AMDGPUCallLowering.h"
  19 #include "AMDGPUFrameLowering.h"
  20 #include "AMDGPUIntrinsicInfo.h"
  21 #include "AMDGPURegisterInfo.h"
  22 #include "AMDGPUSubtarget.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "SIMachineFunctionInfo.h"
  25 #include "llvm/CodeGen/CallingConvLower.h"
  26 #include "llvm/CodeGen/MachineFunction.h"
  27 #include "llvm/CodeGen/MachineRegisterInfo.h"
  28 #include "llvm/CodeGen/SelectionDAG.h"
  29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  30 #include "llvm/IR/DataLayout.h"
  31 #include "llvm/IR/DiagnosticInfo.h"
  32 #include "llvm/Support/KnownBits.h"
  33 #include "SIInstrInfo.h"
  34 using namespace llvm;
  35
  36 static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
  37                             CCValAssign::LocInfo LocInfo,
  38                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
  39   MachineFunction &MF = State.getMachineFunction();
  40   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  41
  42   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
  43                                          ArgFlags.getOrigAlign());
  44   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  45   return true;
  46 }
  47
  48 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
  49                            CCValAssign::LocInfo LocInfo,
  50                            ISD::ArgFlagsTy ArgFlags, CCState &State,
  51                            const TargetRegisterClass *RC,
  52                            unsigned NumRegs) {
  53   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  54   unsigned RegResult = State.AllocateReg(RegList);
  55   if (RegResult == AMDGPU::NoRegister)
  56     return false;
  57
  58   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
  59   return true;
  60 }
  61
  62 static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
  63                               CCValAssign::LocInfo LocInfo,
  64                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
  65   switch (LocVT.SimpleTy) {
  66   case MVT::i64:
  67   case MVT::f64:
  68   case MVT::v2i32:
  69   case MVT::v2f32: {
  70     // Up to SGPR0-SGPR39
  71     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  72                           &AMDGPU::SGPR_64RegClass, 20);
  73   }
  74   default:
  75     return false;
  76   }
  77 }
  78
  79 // Allocate up to VGPR31.
  80 //
  81 // TODO: Since there are no VGPR alignent requirements would it be better to
  82 // split into individual scalar registers?
  83 static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
  84                               CCValAssign::LocInfo LocInfo,
  85                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
  86   switch (LocVT.SimpleTy) {
  87   case MVT::i64:
  88   case MVT::f64:
  89   case MVT::v2i32:
  90   case MVT::v2f32: {
  91     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  92                           &AMDGPU::VReg_64RegClass, 31);
  93   }
  94   case MVT::v4i32:
  95   case MVT::v4f32:
  96   case MVT::v2i64:
  97   case MVT::v2f64: {
  98     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
  99                           &AMDGPU::VReg_128RegClass, 29);
 100   }
 101   case MVT::v8i32:
 102   case MVT::v8f32: {
 103     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
 104                           &AMDGPU::VReg_256RegClass, 25);
 105
 106   }
 107   case MVT::v16i32:
 108   case MVT::v16f32: {
 109     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
 110                           &AMDGPU::VReg_512RegClass, 17);
 111
 112   }
 113   default:
 114     return false;
 115   }
 116 }
 117
 118 #include "AMDGPUGenCallingConv.inc"
 119
 120 // Find a larger type to do a load / store of a vector with.
 121 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
 122   unsigned StoreSize = VT.getStoreSizeInBits();
 123   if (StoreSize <= 32)
 124     return EVT::getIntegerVT(Ctx, StoreSize);
 125
 126   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
 127   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 128 }
 129
 130 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 131                                            const AMDGPUSubtarget &STI)
 132     : TargetLowering(TM), Subtarget(&STI) {
       // Constructor: stores the subtarget pointer and then registers the
       // operation actions, load/store extension and truncation actions, libcall
       // names, tuning fields and DAG-combine opcodes set up in this body.
       //
       // NOTE(review): a few (opcode, type) pairs are configured more than once
       // below; presumably the later call wins, so keep the statement order when
       // editing -- confirm against setOperationAction.
 133   AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
 134   // Lower floating point store/load to integer store/load to reduce the number
 135   // of patterns in tablegen.
 136   setOperationAction(ISD::LOAD, MVT::f32, Promote);
 137   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 138
 139   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
 140   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
 141
 142   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
 143   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 144
 145   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
 146   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
 147
 148   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
 149   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
 150
       // Loads of 64-bit scalars and 2 x 64-bit vectors (integer and FP) are
       // promoted to v2i32 / v4i32 respectively.
 151   setOperationAction(ISD::LOAD, MVT::i64, Promote);
 152   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
 153
 154   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
 155   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
 156
 157   setOperationAction(ISD::LOAD, MVT::f64, Promote);
 158   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
 159
 160   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
 161   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
 162
 163   // There are no 64-bit extloads. These should be done as a 32-bit extload and
 164   // an extension to 64-bit.
 165   for (MVT VT : MVT::integer_valuetypes()) {
 166     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
 167     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
 168     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
 169   }
 170
       // For every integer result type except i64: extending loads from i1 are
       // promoted, from i8 / i16 are legal, and from i32 are expanded.
 171   for (MVT VT : MVT::integer_valuetypes()) {
 172     if (VT == MVT::i64)
 173       continue;
 174
 175     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 176     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
 177     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
 178     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
 179
 180     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 181     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
 182     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
 183     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
 184
 185     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 186     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
 187     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
 188     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
 189   }
 190
       // Extending loads from v2i8 / v4i8 / v2i16 / v4i16 memory types are
       // expanded for every integer vector result type.
 191   for (MVT VT : MVT::integer_vector_valuetypes()) {
 192     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
 193     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
 194     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
 195     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
 196     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
 197     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
 198     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
 199     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
 200     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
 201     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
 202     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
 203     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
 204   }
 205
       // Floating-point extending loads (f16->f32, f32->f64, f16->f64, scalar and
       // the listed vector widths) are expanded.
 206   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 207   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
 208   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
 209   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
 210
 211   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 212   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
 213   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
 214   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
 215
 216   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 217   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
 218   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
 219   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
 220
       // Stores use the same type promotions as the loads above.
 221   setOperationAction(ISD::STORE, MVT::f32, Promote);
 222   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 223
 224   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
 225   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 226
 227   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
 228   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 229
 230   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
 231   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 232
 233   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
 234   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 235
 236   setOperationAction(ISD::STORE, MVT::i64, Promote);
 237   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 238
 239   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
 240   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
 241
 242   setOperationAction(ISD::STORE, MVT::f64, Promote);
 243   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
 244
 245   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
 246   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 247
       // Truncating stores from i64 / v2i64 to narrower integer types, and
       // between the listed floating-point widths, are expanded.
 248   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
 249   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
 250   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 251   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 252
 253   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
 254   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
 255   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
 256   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
 257
 258   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 259   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
 260   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
 261   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
 262
 263   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 264   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 265
 266   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
 267   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 268
 269   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
 270   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 271
 272   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
 273   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 274
 275
       // 32- and 64-bit integer and floating-point constants are legal.
 276   setOperationAction(ISD::Constant, MVT::i32, Legal);
 277   setOperationAction(ISD::Constant, MVT::i64, Legal);
 278   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 279   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 280
       // Jump-table and indirect branches are expanded.
 281   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 282   setOperationAction(ISD::BRIND, MVT::Other, Expand);
 283
 284   // This is totally unsupported, just custom lower to produce an error.
 285   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 286
 287   // Library functions.  These default to Expand, but we have instructions
 288   // for them.
 289   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
 290   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
 291   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
 292   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
 293   setOperationAction(ISD::FABS,   MVT::f32, Legal);
 294   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 295   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 296   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 297   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
 298   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
 299
       // FROUND, FNEARBYINT and FREM are custom lowered for both f32 and f64.
 300   setOperationAction(ISD::FROUND, MVT::f32, Custom);
 301   setOperationAction(ISD::FROUND, MVT::f64, Custom);
 302
 303   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
 304   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 305
 306   setOperationAction(ISD::FREM, MVT::f32, Custom);
 307   setOperationAction(ISD::FREM, MVT::f64, Custom);
 308
 309   // v_mad_f32 does not support denormals according to some sources.
 310   if (!Subtarget->hasFP32Denormals())
 311     setOperationAction(ISD::FMAD, MVT::f32, Legal);
 312
 313   // Expand to fneg + fadd.
 314   setOperationAction(ISD::FSUB, MVT::f64, Expand);
 315
       // Vector concatenation and subvector extraction on the listed
       // 32-bit-element vector types are custom lowered.
 316   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
 317   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
 318   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
 319   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
 320   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
 321   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
 322   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
 323   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
 324   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
 325   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 326
       // Generations before SEA_ISLANDS custom-lower the f64 rounding operations.
 327   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 328     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
 329     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
 330     setOperationAction(ISD::FRINT, MVT::f64, Custom);
 331     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
 332   }
 333
 334   if (!Subtarget->hasBFI()) {
 335     // fcopysign can be done in a single instruction with BFI.
 336     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 337     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 338   }
 339
       // Half-precision conversions: f16 -> f64 is expanded, f32/f64 -> f16 are
       // custom lowered.
 340   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 341   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
 342   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
 343
 344   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 345   for (MVT VT : ScalarIntVTs) {
 346     // These should use [SU]DIVREM, so set them to expand
 347     setOperationAction(ISD::SDIV, VT, Expand);
 348     setOperationAction(ISD::UDIV, VT, Expand);
 349     setOperationAction(ISD::SREM, VT, Expand);
 350     setOperationAction(ISD::UREM, VT, Expand);
 351
 352     // GPU does not have divrem function for signed or unsigned.
 353     setOperationAction(ISD::SDIVREM, VT, Custom);
 354     setOperationAction(ISD::UDIVREM, VT, Custom);
 355
 356     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
 357     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 358     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 359
 360     setOperationAction(ISD::BSWAP, VT, Expand);
 361     setOperationAction(ISD::CTTZ, VT, Expand);
 362     setOperationAction(ISD::CTLZ, VT, Expand);
 363   }
 364
       // CTPOP is expanded for a width when hasBCNT() is false for that width.
 365   if (!Subtarget->hasBCNT(32))
 366     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 367
 368   if (!Subtarget->hasBCNT(64))
 369     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 370
 371   // The hardware supports 32-bit ROTR, but not ROTL.
 372   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 373   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 374   setOperationAction(ISD::ROTR, MVT::i64, Expand);
 375
       // Remaining scalar integer settings. The UDIV / UREM i32 entries repeat the
       // Expand action already set for i32 in the ScalarIntVTs loop above.
 376   setOperationAction(ISD::MUL, MVT::i64, Expand);
 377   setOperationAction(ISD::MULHU, MVT::i64, Expand);
 378   setOperationAction(ISD::MULHS, MVT::i64, Expand);
 379   setOperationAction(ISD::UDIV, MVT::i32, Expand);
 380   setOperationAction(ISD::UREM, MVT::i32, Expand);
 381   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 382   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 383   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 384   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 385   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 386
       // 32-bit signed / unsigned min and max are legal.
 387   setOperationAction(ISD::SMIN, MVT::i32, Legal);
 388   setOperationAction(ISD::UMIN, MVT::i32, Legal);
 389   setOperationAction(ISD::SMAX, MVT::i32, Legal);
 390   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 391
       // The zero-undef count-leading/trailing-zeros forms for i32 depend on the
       // subtarget's hasFFBH() / hasFFBL() queries.
 392   if (Subtarget->hasFFBH())
 393     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
 394
 395   if (Subtarget->hasFFBL())
 396     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
 397
       // This sets CTLZ i64 to Custom after the ScalarIntVTs loop set it to Expand.
 398   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
 399   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 400
 401   // We only really have 32-bit BFE instructions (and 16-bit on VI).
 402   //
 403   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
 404   // effort to match them now. We want this to be false for i64 cases when the
 405   // extraction isn't restricted to the upper or lower half. Ideally we would
 406   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
 407   // span the midpoint are probably relatively rare, so don't worry about them
 408   // for now.
 409   if (Subtarget->hasBFE())
 410     setHasExtractBitsInsn(true);
 411
       // Integer vector types whose operations are configured in bulk below.
 412   static const MVT::SimpleValueType VectorIntTypes[] = {
 413     MVT::v2i32, MVT::v4i32
 414   };
 415
 416   for (MVT VT : VectorIntTypes) {
 417     // Expand the following operations for the current type by default.
 418     setOperationAction(ISD::ADD,  VT, Expand);
 419     setOperationAction(ISD::AND,  VT, Expand);
 420     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 421     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 422     setOperationAction(ISD::MUL,  VT, Expand);
 423     setOperationAction(ISD::MULHU, VT, Expand);
 424     setOperationAction(ISD::MULHS, VT, Expand);
 425     setOperationAction(ISD::OR,   VT, Expand);
 426     setOperationAction(ISD::SHL,  VT, Expand);
 427     setOperationAction(ISD::SRA,  VT, Expand);
 428     setOperationAction(ISD::SRL,  VT, Expand);
 429     setOperationAction(ISD::ROTL, VT, Expand);
 430     setOperationAction(ISD::ROTR, VT, Expand);
 431     setOperationAction(ISD::SUB,  VT, Expand);
 432     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 433     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 434     setOperationAction(ISD::SDIV, VT, Expand);
 435     setOperationAction(ISD::UDIV, VT, Expand);
 436     setOperationAction(ISD::SREM, VT, Expand);
 437     setOperationAction(ISD::UREM, VT, Expand);
 438     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 439     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
         // NOTE(review): SDIVREM is Custom while UDIVREM is Expand here -- the
         // asymmetry is in the original; confirm it is intentional.
 440     setOperationAction(ISD::SDIVREM, VT, Custom);
 441     setOperationAction(ISD::UDIVREM, VT, Expand);
 442     setOperationAction(ISD::ADDC, VT, Expand);
 443     setOperationAction(ISD::SUBC, VT, Expand);
 444     setOperationAction(ISD::ADDE, VT, Expand);
 445     setOperationAction(ISD::SUBE, VT, Expand);
 446     setOperationAction(ISD::SELECT, VT, Expand);
 447     setOperationAction(ISD::VSELECT, VT, Expand);
 448     setOperationAction(ISD::SELECT_CC, VT, Expand);
 449     setOperationAction(ISD::XOR,  VT, Expand);
 450     setOperationAction(ISD::BSWAP, VT, Expand);
 451     setOperationAction(ISD::CTPOP, VT, Expand);
 452     setOperationAction(ISD::CTTZ, VT, Expand);
 453     setOperationAction(ISD::CTLZ, VT, Expand);
 454     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 455   }
 456
       // Floating-point vector types; every operation listed in the loop is
       // expanded for them.
 457   static const MVT::SimpleValueType FloatVectorTypes[] = {
 458     MVT::v2f32, MVT::v4f32
 459   };
 460
 461   for (MVT VT : FloatVectorTypes) {
 462     setOperationAction(ISD::FABS, VT, Expand);
 463     setOperationAction(ISD::FMINNUM, VT, Expand);
 464     setOperationAction(ISD::FMAXNUM, VT, Expand);
 465     setOperationAction(ISD::FADD, VT, Expand);
 466     setOperationAction(ISD::FCEIL, VT, Expand);
 467     setOperationAction(ISD::FCOS, VT, Expand);
 468     setOperationAction(ISD::FDIV, VT, Expand);
 469     setOperationAction(ISD::FEXP2, VT, Expand);
 470     setOperationAction(ISD::FLOG2, VT, Expand);
 471     setOperationAction(ISD::FREM, VT, Expand);
 472     setOperationAction(ISD::FPOW, VT, Expand);
 473     setOperationAction(ISD::FFLOOR, VT, Expand);
 474     setOperationAction(ISD::FTRUNC, VT, Expand);
 475     setOperationAction(ISD::FMUL, VT, Expand);
 476     setOperationAction(ISD::FMA, VT, Expand);
 477     setOperationAction(ISD::FRINT, VT, Expand);
 478     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 479     setOperationAction(ISD::FSQRT, VT, Expand);
 480     setOperationAction(ISD::FSIN, VT, Expand);
 481     setOperationAction(ISD::FSUB, VT, Expand);
 482     setOperationAction(ISD::FNEG, VT, Expand);
 483     setOperationAction(ISD::VSELECT, VT, Expand);
 484     setOperationAction(ISD::SELECT_CC, VT, Expand);
 485     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 486     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 487   }
 488
 489   // This causes using an unrolled select operation rather than expansion with
 490   // bit operations. This is in general better, but the alternative using BFI
 491   // instructions may be better if the select sources are SGPRs.
 492   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
 493   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
 494
 495   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
 496   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
 497
 498   // There are no libcalls of any kind.
 499   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
 500     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
 501
       // Scalar and vector booleans both use the zero-or-negative-one encoding.
 502   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 503   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 504
       // Schedule for register pressure and report jumps as expensive.
 505   setSchedulingPreference(Sched::RegPressure);
 506   setJumpIsExpensive(true);
 507
 508   // FIXME: This is only partially true. If we have to do vector compares, any
 509   // SGPR pair can be a condition register. If we have a uniform condition, we
 510   // are better off doing SALU operations, where there is only one SCC. For now,
 511   // we don't have a way of knowing during instruction selection if a condition
 512   // will be uniform and we always use vector compares. Assume we are using
 513   // vector compares until that is fixed.
 514   setHasMultipleConditionRegisters(true);
 515
 516   // SI at least has hardware support for floating point exceptions, but no way
 517   // of using or handling them is implemented. They are also optional in OpenCL
 518   // (Section 7.3)
 519   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 520
 521   PredictableSelectIsExpensive = false;
 522
 523   // We want to find all load dependencies for long chains of stores to enable
 524   // merging into very wide vectors. The problem is with vectors with > 4
 525   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
 526   // vectors are a legal type, even though we have to split the loads
 527   // usually. When we can more precisely specify load legality per address
 528   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
 529   // smarter so that they can figure out what to do in 2 iterations without all
 530   // N > 4 stores on the same chain.
 531   GatherAllAliasesMaxDepth = 16;
 532
 533   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
 534   // about these during lowering.
 535   MaxStoresPerMemcpy  = 0xffffffff;
 536   MaxStoresPerMemmove = 0xffffffff;
 537   MaxStoresPerMemset  = 0xffffffff;
 538
       // Opcodes registered through setTargetDAGCombine.
 539   setTargetDAGCombine(ISD::BITCAST);
 540   setTargetDAGCombine(ISD::SHL);
 541   setTargetDAGCombine(ISD::SRA);
 542   setTargetDAGCombine(ISD::SRL);
 543   setTargetDAGCombine(ISD::MUL);
 544   setTargetDAGCombine(ISD::MULHU);
 545   setTargetDAGCombine(ISD::MULHS);
 546   setTargetDAGCombine(ISD::SELECT);
 547   setTargetDAGCombine(ISD::SELECT_CC);
 548   setTargetDAGCombine(ISD::STORE);
 549   setTargetDAGCombine(ISD::FADD);
 550   setTargetDAGCombine(ISD::FSUB);
 551   setTargetDAGCombine(ISD::FNEG);
 552   setTargetDAGCombine(ISD::FABS);
 553 }
 554
 555 //===----------------------------------------------------------------------===//
 556 // Target Information
 557 //===----------------------------------------------------------------------===//
 558
 559 LLVM_READNONE
 560 static bool fnegFoldsIntoOp(unsigned Opc) {
 561   switch (Opc) {
 562   case ISD::FADD:
 563   case ISD::FSUB:
 564   case ISD::FMUL:
 565   case ISD::FMA:
 566   case ISD::FMAD:
 567   case ISD::FMINNUM:
 568   case ISD::FMAXNUM:
 569   case ISD::FSIN:
 570   case ISD::FTRUNC:
 571   case ISD::FRINT:
 572   case ISD::FNEARBYINT:
 573   case AMDGPUISD::RCP:
 574   case AMDGPUISD::RCP_LEGACY:
 575   case AMDGPUISD::SIN_HW:
 576   case AMDGPUISD::FMUL_LEGACY:
 577   case AMDGPUISD::FMIN_LEGACY:
 578   case AMDGPUISD::FMAX_LEGACY:
 579     return true;
 580   default:
 581     return false;
 582   }
 583 }
 584
 585 /// \p returns true if the operation will definitely need to use a 64-bit
 586 /// encoding, and thus will use a VOP3 encoding regardless of the source
 587 /// modifiers.
 588 LLVM_READONLY
 589 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
 590   return N->getNumOperands() > 2 || VT == MVT::f64;
 591 }
 592
 593 // Most FP instructions support source modifiers, but this could be refined
 594 // slightly.
 595 LLVM_READONLY
 596 static bool hasSourceMods(const SDNode *N) {
 597   if (isa<MemSDNode>(N))
 598     return false;
 599
 600   switch (N->getOpcode()) {
 601   case ISD::CopyToReg:
 602   case ISD::SELECT:
 603   case ISD::FDIV:
 604   case ISD::FREM:
 605   case ISD::INLINEASM:
 606   case AMDGPUISD::INTERP_P1:
 607   case AMDGPUISD::INTERP_P2:
 608   case AMDGPUISD::DIV_SCALE:
 609
 610   // TODO: Should really be looking at the users of the bitcast. These are
 611   // problematic because bitcasts are used to legalize all stores to integer
 612   // types.
 613   case ISD::BITCAST:
 614     return false;
 615   default:
 616     return true;
 617   }
 618 }
 619
 620 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
 621                                                  unsigned CostThreshold) {
 622   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
 623   // it is truly free to use a source modifier in all cases. If there are
 624   // multiple users but for each one will necessitate using VOP3, there will be
 625   // a code size increase. Try to avoid increasing code size unless we know it
 626   // will save on the instruction count.
 627   unsigned NumMayIncreaseSize = 0;
 628   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
 629
 630   // XXX - Should this limit number of uses to check?
 631   for (const SDNode *U : N->uses()) {
 632     if (!hasSourceMods(U))
 633       return false;
 634
 635     if (!opMustUseVOP3Encoding(U, VT)) {
 636       if (++NumMayIncreaseSize > CostThreshold)
 637         return false;
 638     }
 639   }
 640
 641   return true;
 642 }
 643
 644 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
       // Vector indices are always reported as i32; the DataLayout argument is
       // unnamed and unused.
 645   return MVT::i32;
 646 }
 647
 648 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
       // Every select kind is reported as supported; SelType is not inspected.
 649   return true;
 650 }
 651
 652 // The backend supports 32 and 64 bit floating point immediates.
 653 // FIXME: Why are we reporting vectors of FP immediates as legal?
 654 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 655   EVT ScalarVT = VT.getScalarType();
 656   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
 657          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
 658 }
 659
 660 // We don't want to shrink f64 / f32 constants.
 661 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 662   EVT ScalarVT = VT.getScalarType();
 663   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 664 }
 665
 666 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
 667                                                  ISD::LoadExtType,
 668                                                  EVT NewVT) const {
 669
 670   unsigned NewSize = NewVT.getStoreSizeInBits();
 671
 672   // If we are reducing to a 32-bit load, this is always better.
 673   if (NewSize == 32)
 674     return true;
 675
 676   EVT OldVT = N->getValueType(0);
 677   unsigned OldSize = OldVT.getStoreSizeInBits();
 678
 679   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
 680   // extloads, so doing one requires using a buffer_load. In cases where we
 681   // still couldn't use a scalar load, using the wider load shouldn't really
 682   // hurt anything.
 683
 684   // If the old size already had to be an extload, there's no harm in continuing
 685   // to reduce the width.
 686   return (OldSize < 32);
 687 }
 688
 689 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
 690                                                    EVT CastTy) const {
 691
 692   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 693
 694   if (LoadTy.getScalarType() == MVT::i32)
 695     return false;
 696
 697   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
 698   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 699
 700   return (LScalarSize < CastScalarSize) ||
 701          (CastScalarSize >= 32);
 702 }
 703
 704 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
 705 // profitable with the expansion for 64-bit since it's generally good to
 706 // speculate things.
 707 // FIXME: These should really have the size as a parameter.
 708 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
       // Unconditionally true; no size or subtarget check is made.
 709   return true;
 710 }
 711
 712 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
       // Unconditionally true; no size or subtarget check is made.
 713   return true;
 714 }
 715
 716 //===---------------------------------------------------------------------===//
 717 // Target Properties
 718 //===---------------------------------------------------------------------===//
 719
 720 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
 721   assert(VT.isFloatingPoint());
 722
 723   // Packed operations do not have a fabs modifier.
 724   return VT == MVT::f32 || VT == MVT::f64 ||
 725          (Subtarget->has16BitInsts() && VT == MVT::f16);
 726 }
 727
 728 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
 729   assert(VT.isFloatingPoint());
 730   return VT == MVT::f32 || VT == MVT::f64 ||
 731          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
 732          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
 733 }
 734
 735 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
 736                                                          unsigned NumElem,
 737                                                          unsigned AS) const {
       // Unconditionally true; MemVT, NumElem and AS are not inspected.
 738   return true;
 739 }
 740
 741 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
       // Unconditionally true; VecVT is not inspected. Rationale follows.
 742   // There are few operations which truly have vector input operands. Any vector
 743   // operation is going to involve operations on each component, and a
 744   // build_vector will be a copy per element, so it always makes sense to use a
 745   // build_vector input in place of the extracted element to avoid a copy into a
 746   // super register.
 747   //
 748   // We should probably only do this if all users are extracts only, but this
 749   // should be the common case.
 750   return true;
 751 }
 752
 753 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
 754   // Truncate is just accessing a subregister.
 755
 756   unsigned SrcSize = Source.getSizeInBits();
 757   unsigned DestSize = Dest.getSizeInBits();
 758
 759   return DestSize < SrcSize && DestSize % 32 == 0 ;
 760 }
 761
 762 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
 763   // Truncate is just accessing a subregister.
 764
 765   unsigned SrcSize = Source->getScalarSizeInBits();
 766   unsigned DestSize = Dest->getScalarSizeInBits();
 767
 768   if (DestSize== 16 && Subtarget->has16BitInsts())
 769     return SrcSize >= 32;
 770
 771   return DestSize < SrcSize && DestSize % 32 == 0;
 772 }
 773
 774 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
 775   unsigned SrcSize = Src->getScalarSizeInBits();
 776   unsigned DestSize = Dest->getScalarSizeInBits();
 777
 778   if (SrcSize == 16 && Subtarget->has16BitInsts())
 779     return DestSize >= 32;
 780
 781   return SrcSize == 32 && DestSize == 64;
 782 }
 783
 784 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
 785   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
 786   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
 787   // this will enable reducing 64-bit operations the 32-bit, which is always
 788   // good.
 789
 790   if (Src == MVT::i16)
 791     return Dest == MVT::i32 ||Dest == MVT::i64 ;
 792
 793   return Src == MVT::i32 && Dest == MVT::i64;
 794 }
 795
 796 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 797   return isZExtFree(Val.getValueType(), VT2);
 798 }
 799
 800 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 801   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
 802   // limited number of native 64-bit operations. Shrinking an operation to fit
 803   // in a single 32-bit register should always be helpful. As currently used,
 804   // this is much less general than the name suggests, and is only used in
 805   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
 806   // not profitable, and may actually be harmful.
 807   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 808 }
 809
 810 //===---------------------------------------------------------------------===//
 811 // TargetLowering Callbacks
 812 //===---------------------------------------------------------------------===//
 813
 814 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
 815                                                   bool IsVarArg) {
 816   switch (CC) {
 817   case CallingConv::AMDGPU_KERNEL:
 818   case CallingConv::SPIR_KERNEL:
 819     return CC_AMDGPU_Kernel;
 820   case CallingConv::AMDGPU_VS:
 821   case CallingConv::AMDGPU_GS:
 822   case CallingConv::AMDGPU_PS:
 823   case CallingConv::AMDGPU_CS:
 824   case CallingConv::AMDGPU_HS:
 825     return CC_AMDGPU;
 826   case CallingConv::C:
 827   case CallingConv::Fast:
 828     return CC_AMDGPU_Func;
 829   default:
 830     report_fatal_error("Unsupported calling convention.");
 831   }
 832 }
 833
 834 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 835                                                     bool IsVarArg) {
 836   switch (CC) {
 837   case CallingConv::AMDGPU_KERNEL:
 838   case CallingConv::SPIR_KERNEL:
 839     return CC_AMDGPU_Kernel;
 840   case CallingConv::AMDGPU_VS:
 841   case CallingConv::AMDGPU_GS:
 842   case CallingConv::AMDGPU_PS:
 843   case CallingConv::AMDGPU_CS:
 844   case CallingConv::AMDGPU_HS:
 845     return RetCC_SI_Shader;
 846   case CallingConv::C:
 847   case CallingConv::Fast:
 848     return RetCC_AMDGPU_Func;
 849   default:
 850     report_fatal_error("Unsupported calling convention.");
 851   }
 852 }
 853
 854 /// The SelectionDAGBuilder will automatically promote function arguments
 855 /// with illegal types.  However, this does not work for the AMDGPU targets
 856 /// since the function arguments are stored in memory as these illegal types.
 857 /// In order to handle this properly we need to get the original types sizes
 858 /// from the LLVM IR Function and fixup the ISD:InputArg values before
 859 /// passing them to AnalyzeFormalArguments()
 860
 861 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
 862 /// input values across multiple registers.  Each item in the Ins array
 863 /// represents a single value that will be stored in regsters.  Ins[x].VT is
 864 /// the value type of the value that will be stored in the register, so
 865 /// whatever SDNode we lower the argument to needs to be this type.
 866 ///
 867 /// In order to correctly lower the arguments we need to know the size of each
 868 /// argument.  Since Ins[x].VT gives us the size of the register that will
 869 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
 870 /// for the orignal function argument so that we can deduce the correct memory
 871 /// type to use for Ins[x].  In most cases the correct memory type will be
 872 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
 873 /// we have a kernel argument of type v8i8, this argument will be split into
 874 /// 8 parts and each part will be represented by its own item in the Ins array.
 875 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
 876 /// the argument before it was split.  From this, we deduce that the memory type
 877 /// for each individual part is i8.  We pass the memory type as LocVT to the
 878 /// calling convention analysis function and the register type (Ins[x].VT) as
 879 /// the ValVT.
 880 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
 881                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
 882   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
 883     const ISD::InputArg &In = Ins[i];
 884     EVT MemVT;
 885
 886     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
 887
 888     if (!Subtarget->isAmdHsaOS() &&
 889         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
 890       // The ABI says the caller will extend these values to 32-bits.
 891       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
 892     } else if (NumRegs == 1) {
 893       // This argument is not split, so the IR type is the memory type.
 894       assert(!In.Flags.isSplit());
 895       if (In.ArgVT.isExtended()) {
 896         // We have an extended type, like i24, so we should just use the register type
 897         MemVT = In.VT;
 898       } else {
 899         MemVT = In.ArgVT;
 900       }
 901     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
 902                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
 903       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
 904       // We have a vector value which has been split into a vector with
 905       // the same scalar type, but fewer elements.  This should handle
 906       // all the floating-point vector types.
 907       MemVT = In.VT;
 908     } else if (In.ArgVT.isVector() &&
 909                In.ArgVT.getVectorNumElements() == NumRegs) {
 910       // This arg has been split so that each element is stored in a separate
 911       // register.
 912       MemVT = In.ArgVT.getScalarType();
 913     } else if (In.ArgVT.isExtended()) {
 914       // We have an extended type, like i65.
 915       MemVT = In.VT;
 916     } else {
 917       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
 918       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
 919       if (In.VT.isInteger()) {
 920         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
 921       } else if (In.VT.isVector()) {
 922         assert(!In.VT.getScalarType().isFloatingPoint());
 923         unsigned NumElements = In.VT.getVectorNumElements();
 924         assert(MemoryBits % NumElements == 0);
 925         // This vector type has been split into another vector type with
 926         // a different elements size.
 927         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
 928                                          MemoryBits / NumElements);
 929         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
 930       } else {
 931         llvm_unreachable("cannot deduce memory type.");
 932       }
 933     }
 934
 935     // Convert one element vectors to scalar.
 936     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
 937       MemVT = MemVT.getScalarType();
 938
 939     if (MemVT.isExtended()) {
 940       // This should really only happen if we have vec3 arguments
 941       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
 942       MemVT = MemVT.getPow2VectorType(State.getContext());
 943     }
 944
 945     assert(MemVT.isSimple());
 946     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
 947                     State);
 948   }
 949 }
 950
 951 SDValue AMDGPUTargetLowering::LowerReturn(
 952   SDValue Chain, CallingConv::ID CallConv,
 953   bool isVarArg,
 954   const SmallVectorImpl<ISD::OutputArg> &Outs,
 955   const SmallVectorImpl<SDValue> &OutVals,
 956   const SDLoc &DL, SelectionDAG &DAG) const {
 957   // FIXME: Fails for r600 tests
 958   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
 959   // "wave terminate should not have return values");
 960   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 961 }
 962
 963 //===---------------------------------------------------------------------===//
 964 // Target specific lowering
 965 //===---------------------------------------------------------------------===//
 966
 967 /// Selects the correct CCAssignFn for a given CallingConvention value.
 968 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
 969                                                     bool IsVarArg) {
 970   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
 971 }
 972
 973 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
 974                                                       bool IsVarArg) {
 975   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 976 }
 977
 978 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
 979                                         SmallVectorImpl<SDValue> &InVals) const {
 980   SDValue Callee = CLI.Callee;
 981   SelectionDAG &DAG = CLI.DAG;
 982
 983   const Function &Fn = *DAG.getMachineFunction().getFunction();
 984
 985   StringRef FuncName("<unknown>");
 986
 987   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
 988     FuncName = G->getSymbol();
 989   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
 990     FuncName = G->getGlobal()->getName();
 991
 992   DiagnosticInfoUnsupported NoCalls(
 993       Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
 994   DAG.getContext()->diagnose(NoCalls);
 995
 996   if (!CLI.IsTailCall) {
 997     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
 998       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
 999   }
1000
1001   return DAG.getEntryNode();
1002 }
1003
1004 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1005                                                       SelectionDAG &DAG) const {
1006   const Function &Fn = *DAG.getMachineFunction().getFunction();
1007
1008   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1009                                             SDLoc(Op).getDebugLoc());
1010   DAG.getContext()->diagnose(NoDynamicAlloca);
1011   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1012   return DAG.getMergeValues(Ops, SDLoc());
1013 }
1014
1015 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1016                                              SelectionDAG &DAG) const {
1017   switch (Op.getOpcode()) {
1018   default:
1019     Op->print(errs(), &DAG);
1020     llvm_unreachable("Custom lowering code for this"
1021                      "instruction is not implemented yet!");
1022     break;
1023   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1024   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1025   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1026   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1027   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1028   case ISD::FREM: return LowerFREM(Op, DAG);
1029   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1030   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1031   case ISD::FRINT: return LowerFRINT(Op, DAG);
1032   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1033   case ISD::FROUND: return LowerFROUND(Op, DAG);
1034   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1035   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1036   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1037   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1038   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1039   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1040   case ISD::CTLZ:
1041   case ISD::CTLZ_ZERO_UNDEF:
1042     return LowerCTLZ(Op, DAG);
1043   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1044   }
1045   return Op;
1046 }
1047
1048 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1049                                               SmallVectorImpl<SDValue> &Results,
1050                                               SelectionDAG &DAG) const {
1051   switch (N->getOpcode()) {
1052   case ISD::SIGN_EXTEND_INREG:
1053     // Different parts of legalization seem to interpret which type of
1054     // sign_extend_inreg is the one to check for custom lowering. The extended
1055     // from type is what really matters, but some places check for custom
1056     // lowering of the result type. This results in trying to use
1057     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1058     // nothing here and let the illegal result integer be handled normally.
1059     return;
1060   default:
1061     return;
1062   }
1063 }
1064
1065 static bool hasDefinedInitializer(const GlobalValue *GV) {
1066   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1067   if (!GVar || !GVar->hasInitializer())
1068     return false;
1069
1070   return !isa<UndefValue>(GVar->getInitializer());
1071 }
1072
1073 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1074                                                  SDValue Op,
1075                                                  SelectionDAG &DAG) const {
1076
1077   const DataLayout &DL = DAG.getDataLayout();
1078   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1079   const GlobalValue *GV = G->getGlobal();
1080
1081   if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
1082     // XXX: What does the value of G->getOffset() mean?
1083     assert(G->getOffset() == 0 &&
1084          "Do not know what to do with an non-zero offset");
1085
1086     // TODO: We could emit code to handle the initialization somewhere.
1087     if (!hasDefinedInitializer(GV)) {
1088       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1089       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1090     }
1091   }
1092
1093   const Function &Fn = *DAG.getMachineFunction().getFunction();
1094   DiagnosticInfoUnsupported BadInit(
1095       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1096   DAG.getContext()->diagnose(BadInit);
1097   return SDValue();
1098 }
1099
1100 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1101                                                   SelectionDAG &DAG) const {
1102   SmallVector<SDValue, 8> Args;
1103
1104   for (const SDUse &U : Op->ops())
1105     DAG.ExtractVectorElements(U.get(), Args);
1106
1107   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1108 }
1109
1110 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1111                                                      SelectionDAG &DAG) const {
1112
1113   SmallVector<SDValue, 8> Args;
1114   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1115   EVT VT = Op.getValueType();
1116   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1117                             VT.getVectorNumElements());
1118
1119   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1120 }
1121
1122 /// \brief Generate Min/Max node
1123 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1124                                                    SDValue LHS, SDValue RHS,
1125                                                    SDValue True, SDValue False,
1126                                                    SDValue CC,
1127                                                    DAGCombinerInfo &DCI) const {
1128   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1129     return SDValue();
1130
1131   SelectionDAG &DAG = DCI.DAG;
1132   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1133   switch (CCOpcode) {
1134   case ISD::SETOEQ:
1135   case ISD::SETONE:
1136   case ISD::SETUNE:
1137   case ISD::SETNE:
1138   case ISD::SETUEQ:
1139   case ISD::SETEQ:
1140   case ISD::SETFALSE:
1141   case ISD::SETFALSE2:
1142   case ISD::SETTRUE:
1143   case ISD::SETTRUE2:
1144   case ISD::SETUO:
1145   case ISD::SETO:
1146     break;
1147   case ISD::SETULE:
1148   case ISD::SETULT: {
1149     if (LHS == True)
1150       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1151     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1152   }
1153   case ISD::SETOLE:
1154   case ISD::SETOLT:
1155   case ISD::SETLE:
1156   case ISD::SETLT: {
1157     // Ordered. Assume ordered for undefined.
1158
1159     // Only do this after legalization to avoid interfering with other combines
1160     // which might occur.
1161     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1162         !DCI.isCalledByLegalizer())
1163       return SDValue();
1164
1165     // We need to permute the operands to get the correct NaN behavior. The
1166     // selected operand is the second one based on the failing compare with NaN,
1167     // so permute it based on the compare type the hardware uses.
1168     if (LHS == True)
1169       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1170     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1171   }
1172   case ISD::SETUGE:
1173   case ISD::SETUGT: {
1174     if (LHS == True)
1175       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1176     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1177   }
1178   case ISD::SETGT:
1179   case ISD::SETGE:
1180   case ISD::SETOGE:
1181   case ISD::SETOGT: {
1182     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1183         !DCI.isCalledByLegalizer())
1184       return SDValue();
1185
1186     if (LHS == True)
1187       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1188     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1189   }
1190   case ISD::SETCC_INVALID:
1191     llvm_unreachable("Invalid setcc condcode!");
1192   }
1193   return SDValue();
1194 }
1195
1196 std::pair<SDValue, SDValue>
1197 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1198   SDLoc SL(Op);
1199
1200   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1201
1202   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1203   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1204
1205   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1206   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1207
1208   return std::make_pair(Lo, Hi);
1209 }
1210
1211 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1212   SDLoc SL(Op);
1213
1214   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1215   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1216   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1217 }
1218
1219 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1220   SDLoc SL(Op);
1221
1222   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1223   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1224   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1225 }
1226
1227 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1228                                               SelectionDAG &DAG) const {
1229   LoadSDNode *Load = cast<LoadSDNode>(Op);
1230   EVT VT = Op.getValueType();
1231
1232
1233   // If this is a 2 element vector, we really want to scalarize and not create
1234   // weird 1 element vectors.
1235   if (VT.getVectorNumElements() == 2)
1236     return scalarizeVectorLoad(Load, DAG);
1237
1238   SDValue BasePtr = Load->getBasePtr();
1239   EVT PtrVT = BasePtr.getValueType();
1240   EVT MemVT = Load->getMemoryVT();
1241   SDLoc SL(Op);
1242
1243   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1244
1245   EVT LoVT, HiVT;
1246   EVT LoMemVT, HiMemVT;
1247   SDValue Lo, Hi;
1248
1249   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1250   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1251   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1252
1253   unsigned Size = LoMemVT.getStoreSize();
1254   unsigned BaseAlign = Load->getAlignment();
1255   unsigned HiAlign = MinAlign(BaseAlign, Size);
1256
1257   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1258                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1259                                   BaseAlign, Load->getMemOperand()->getFlags());
1260   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1261                               DAG.getConstant(Size, SL, PtrVT));
1262   SDValue HiLoad =
1263       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1264                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1265                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1266
1267   SDValue Ops[] = {
1268     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1269     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1270                 LoLoad.getValue(1), HiLoad.getValue(1))
1271   };
1272
1273   return DAG.getMergeValues(Ops, SL);
1274 }
1275
1276 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1277                                                SelectionDAG &DAG) const {
1278   StoreSDNode *Store = cast<StoreSDNode>(Op);
1279   SDValue Val = Store->getValue();
1280   EVT VT = Val.getValueType();
1281
1282   // If this is a 2 element vector, we really want to scalarize and not create
1283   // weird 1 element vectors.
1284   if (VT.getVectorNumElements() == 2)
1285     return scalarizeVectorStore(Store, DAG);
1286
1287   EVT MemVT = Store->getMemoryVT();
1288   SDValue Chain = Store->getChain();
1289   SDValue BasePtr = Store->getBasePtr();
1290   SDLoc SL(Op);
1291
1292   EVT LoVT, HiVT;
1293   EVT LoMemVT, HiMemVT;
1294   SDValue Lo, Hi;
1295
1296   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1297   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1298   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1299
1300   EVT PtrVT = BasePtr.getValueType();
1301   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1302                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
1303                                               PtrVT));
1304
1305   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1306   unsigned BaseAlign = Store->getAlignment();
1307   unsigned Size = LoMemVT.getStoreSize();
1308   unsigned HiAlign = MinAlign(BaseAlign, Size);
1309
1310   SDValue LoStore =
1311       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1312                         Store->getMemOperand()->getFlags());
1313   SDValue HiStore =
1314       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1315                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1316
1317   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1318 }
1319
1320 // This is a shortcut for integer division because we have fast i32<->f32
1321 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1322 // float is enough to accurately represent up to a 24-bit signed integer.
1323 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1324                                             bool Sign) const {
1325   SDLoc DL(Op);
1326   EVT VT = Op.getValueType();
1327   SDValue LHS = Op.getOperand(0);
1328   SDValue RHS = Op.getOperand(1);
1329   MVT IntVT = MVT::i32;
1330   MVT FltVT = MVT::f32;
1331
1332   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1333   if (LHSSignBits < 9)
1334     return SDValue();
1335
1336   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1337   if (RHSSignBits < 9)
1338     return SDValue();
1339
1340   unsigned BitSize = VT.getSizeInBits();
1341   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1342   unsigned DivBits = BitSize - SignBits;
1343   if (Sign)
1344     ++DivBits;
1345
1346   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1347   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1348
1349   SDValue jq = DAG.getConstant(1, DL, IntVT);
1350
1351   if (Sign) {
1352     // char|short jq = ia ^ ib;
1353     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1354
1355     // jq = jq >> (bitsize - 2)
1356     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1357                      DAG.getConstant(BitSize - 2, DL, VT));
1358
1359     // jq = jq | 0x1
1360     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1361   }
1362
1363   // int ia = (int)LHS;
1364   SDValue ia = LHS;
1365
1366   // int ib, (int)RHS;
1367   SDValue ib = RHS;
1368
1369   // float fa = (float)ia;
1370   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1371
1372   // float fb = (float)ib;
1373   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1374
1375   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1376                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1377
1378   // fq = trunc(fq);
1379   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1380
1381   // float fqneg = -fq;
1382   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1383
1384   // float fr = mad(fqneg, fb, fa);
1385   unsigned OpCode = Subtarget->hasFP32Denormals() ?
1386                     (unsigned)AMDGPUISD::FMAD_FTZ :
1387                     (unsigned)ISD::FMAD;
1388   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1389
1390   // int iq = (int)fq;
1391   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1392
1393   // fr = fabs(fr);
1394   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1395
1396   // fb = fabs(fb);
1397   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1398
1399   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1400
1401   // int cv = fr >= fb;
1402   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1403
1404   // jq = (cv ? jq : 0);
1405   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1406
1407   // dst = iq + jq;
1408   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1409
1410   // Rem needs compensation, it's easier to recompute it
1411   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1412   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1413
1414   // Truncate to number of bits this divide really is.
1415   if (Sign) {
1416     SDValue InRegSize
1417       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1418     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1419     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1420   } else {
1421     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1422     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1423     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1424   }
1425
1426   return DAG.getMergeValues({ Div, Rem }, DL);
1427 }
1428
1429 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1430                                       SelectionDAG &DAG,
1431                                       SmallVectorImpl<SDValue> &Results) const {
1432   assert(Op.getValueType() == MVT::i64);
1433
1434   SDLoc DL(Op);
1435   EVT VT = Op.getValueType();
1436   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1437
1438   SDValue one = DAG.getConstant(1, DL, HalfVT);
1439   SDValue zero = DAG.getConstant(0, DL, HalfVT);
1440
1441   //HiLo split
1442   SDValue LHS = Op.getOperand(0);
1443   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1444   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1445
1446   SDValue RHS = Op.getOperand(1);
1447   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1448   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1449
1450   if (VT == MVT::i64 &&
1451     DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1452     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1453
1454     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1455                               LHS_Lo, RHS_Lo);
1456
1457     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1458     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1459
1460     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1461     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1462     return;
1463   }
1464
1465   // Get Speculative values
1466   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1467   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1468
1469   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1470   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1471   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1472
1473   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1474   SDValue DIV_Lo = zero;
1475
1476   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1477
1478   for (unsigned i = 0; i < halfBitWidth; ++i) {
1479     const unsigned bitPos = halfBitWidth - i - 1;
1480     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1481     // Get value of high bit
1482     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1483     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1484     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1485
1486     // Shift
1487     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1488     // Add LHS high bit
1489     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1490
1491     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1492     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1493
1494     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1495
1496     // Update REM
1497     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1498     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1499   }
1500
1501   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1502   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1503   Results.push_back(DIV);
1504   Results.push_back(REM);
1505 }
1506
/// Lower an unsigned combined divide/remainder (UDIVREM) node.
///
/// i64 is handed to LowerUDIVREM64. For i32, LowerDIVREM24 is tried first;
/// if it produces no result the generic expansion below is used: estimate the
/// quotient from URECIP(Den), then correct quotient and remainder by one step
/// in either direction.
///
/// \returns the merged {quotient, remainder} values.
1507 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1508                                            SelectionDAG &DAG) const {
1509   SDLoc DL(Op);
1510   EVT VT = Op.getValueType();
1511
1512   if (VT == MVT::i64) {
1513     SmallVector<SDValue, 2> Results;
1514     LowerUDIVREM64(Op, DAG, Results);
1515     return DAG.getMergeValues(Results, DL);
1516   }
1517
       // Try the 24-bit helper first (unsigned variant: last argument is false).
1518   if (VT == MVT::i32) {
1519     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1520       return Res;
1521   }
1522
1523   SDValue Num = Op.getOperand(0);
1524   SDValue Den = Op.getOperand(1);
1525
1526   // RCP =  URECIP(Den) = 2^32 / Den + e
1527   // e is rounding error.
1528   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1529
1530   // RCP_LO = mul(RCP, Den)
1531   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1532
1533   // RCP_HI = mulhu (RCP, Den)
1534   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1535
1536   // NEG_RCP_LO = -RCP_LO
1537   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1538                                                      RCP_LO);
1539
1540   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1541   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1542                                            NEG_RCP_LO, RCP_LO,
1543                                            ISD::SETEQ);
1544   // Calculate the rounding error from the URECIP instruction
1545   // E = mulhu(ABS_RCP_LO, RCP)
1546   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1547
1548   // RCP_A_E = RCP + E
1549   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1550
1551   // RCP_S_E = RCP - E
1552   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1553
1554   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
1555   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1556                                      RCP_A_E, RCP_S_E,
1557                                      ISD::SETEQ);
1558   // Quotient = mulhu(Tmp0, Num)
1559   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1560
1561   // Num_S_Remainder = Quotient * Den
1562   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1563
1564   // Remainder = Num - Num_S_Remainder
1565   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1566
1567   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1568   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1569                                                  DAG.getConstant(-1, DL, VT),
1570                                                  DAG.getConstant(0, DL, VT),
1571                                                  ISD::SETUGE);
1572   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
       // i.e. all-ones when the subtraction above did not wrap around.
1573   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1574                                                   Num_S_Remainder,
1575                                                   DAG.getConstant(-1, DL, VT),
1576                                                   DAG.getConstant(0, DL, VT),
1577                                                   ISD::SETUGE);
1578   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1579   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1580                                                Remainder_GE_Zero);
1581
1582   // Calculate Division result:
1583
1584   // Quotient_A_One = Quotient + 1
1585   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1586                                        DAG.getConstant(1, DL, VT));
1587
1588   // Quotient_S_One = Quotient - 1
1589   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1590                                        DAG.getConstant(1, DL, VT));
1591
1592   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1593   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1594                                      Quotient, Quotient_A_One, ISD::SETEQ);
1595
1596   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1597   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1598                             Quotient_S_One, Div, ISD::SETEQ);
1599
1600   // Calculate Rem result:
1601
1602   // Remainder_S_Den = Remainder - Den
1603   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1604
1605   // Remainder_A_Den = Remainder + Den
1606   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1607
1608   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1609   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1610                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1611
1612   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1613   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1614                             Remainder_A_Den, Rem, ISD::SETEQ);
       // Value 0 of the merged result is the quotient, value 1 the remainder.
1615   SDValue Ops[2] = {
1616     Div,
1617     Rem
1618   };
1619   return DAG.getMergeValues(Ops, DL);
1620 }
1621
/// Lower a signed combined divide/remainder (SDIVREM) node.
///
/// i32 first tries LowerDIVREM24. An i64 whose operands both have more than
/// 32 sign bits is divided as i32 and the two results are sign-extended back.
/// Otherwise the sign is stripped from both operands, an unsigned UDIVREM is
/// emitted, and the signs are re-applied to the quotient and remainder.
///
/// \returns the merged {quotient, remainder} values.
1622 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1623                                            SelectionDAG &DAG) const {
1624   SDLoc DL(Op);
1625   EVT VT = Op.getValueType();
1626
1627   SDValue LHS = Op.getOperand(0);
1628   SDValue RHS = Op.getOperand(1);
1629
1630   SDValue Zero = DAG.getConstant(0, DL, VT);
1631   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1632
       // Try the 24-bit helper first (signed variant: last argument is true).
1633   if (VT == MVT::i32) {
1634     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1635       return Res;
1636   }
1637
       // Both operands have more than 32 sign bits, so the division is done on
       // the low halves only.
1638   if (VT == MVT::i64 &&
1639       DAG.ComputeNumSignBits(LHS) > 32 &&
1640       DAG.ComputeNumSignBits(RHS) > 32) {
1641     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1642
1643     // HiLo split: only the low halves (element index 0) are needed.
1644     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1645     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1646     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1647                                  LHS_Lo, RHS_Lo);
1648     SDValue Res[2] = {
1649       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1650       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1651     };
1652     return DAG.getMergeValues(Res, DL);
1653   }
1654
       // Sign masks: -1 for a negative operand, 0 otherwise.
1655   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1656   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
       // The quotient is negative exactly when the operand signs differ.
1657   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1658   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1659
       // (x + sign) ^ sign negates x when sign is -1 and leaves it unchanged
       // when sign is 0.
1660   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1661   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1662
1663   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1664   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1665
1666   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1667   SDValue Rem = Div.getValue(1);
1668
       // (v ^ sign) - sign re-applies the sign to each unsigned result.
1669   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1670   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1671
1672   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1673   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1674
1675   SDValue Res[2] = {
1676     Div,
1677     Rem
1678   };
1679   return DAG.getMergeValues(Res, DL);
1680 }
1681
1682 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
     //
     // Expands a floating-point remainder into divide, truncate, multiply and
     // subtract nodes of the operation's own value type.
1683 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1684   SDLoc SL(Op);
1685   EVT VT = Op.getValueType();
1686   SDValue X = Op.getOperand(0);
1687   SDValue Y = Op.getOperand(1);
1688
1689   // TODO: Should this propagate fast-math-flags?
1690
1691   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
       // NOTE: despite its name, Floor holds the FTRUNC (round toward zero) of
       // the quotient, matching the formula above.
1692   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1693   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1694
1695   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1696 }
1697
1698 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1699   SDLoc SL(Op);
1700   SDValue Src = Op.getOperand(0);
1701
1702   // result = trunc(src)
1703   // if (src > 0.0 && src != result)
1704   //   result += 1.0
1705
1706   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1707
1708   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1709   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1710
1711   EVT SetCCVT =
1712       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1713
1714   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1715   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1716   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1717
1718   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1719   // TODO: Should this propagate fast-math-flags?
1720   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1721 }
1722
/// Build nodes that compute the unbiased exponent of an f64 from its upper
/// 32 bits.
///
/// \param Hi  the high i32 word of the f64 bit pattern.
/// \returns an i32 value: the exponent field extracted with BFE_U32 minus
///          the bias 1023.
1723 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1724                                   SelectionDAG &DAG) {
1725   const unsigned FractBits = 52;
1726   const unsigned ExpBits = 11;
1727
       // BFE_U32 is given (Hi, FractBits - 32 = 20, ExpBits = 11); presumably
       // these are the bit offset and field width - confirm against the node
       // definition.
1728   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1729                                 Hi,
1730                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
1731                                 DAG.getConstant(ExpBits, SL, MVT::i32));
1732   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1733                             DAG.getConstant(1023, SL, MVT::i32));
1734
1735   return Exp;
1736 }
1737
/// Lower an f64 FTRUNC with integer bit manipulation: clear the fraction bits
/// that lie below the binary point for the value's exponent.
///
/// Asserts that the result type is f64.
/// \returns the truncated value bitcast back to f64.
1738 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1739   SDLoc SL(Op);
1740   SDValue Src = Op.getOperand(0);
1741
1742   assert(Op.getValueType() == MVT::f64);
1743
1744   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1745   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1746
1747   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1748
1749   // Extract the upper half, since this is where we will find the sign and
1750   // exponent.
1751   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1752
1753   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1754
1755   const unsigned FractBits = 52;
1756
1757   // Extract the sign bit.
1758   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1759   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1760
1761   // Extend back to 64-bits.
1762   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1763   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1764
1765   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1766   const SDValue FractMask
1767     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1768
       // Shifting the 52-bit fraction mask right by Exp leaves only the bits
       // that are fractional for this exponent; Tmp0 is Src with them cleared.
1769   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1770   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1771   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1772
1773   EVT SetCCVT =
1774       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1775
1776   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1777
1778   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1779   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1780
       // Exp < 0: only the sign bit survives (a signed zero).
       // Exp > 51: the source bits are returned unchanged.
1781   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1782   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1783
1784   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1785 }
1786
/// Lower an f64 FRINT by adding and then subtracting 2^52 carrying the sign
/// of the source, so the FADD/FSUB pair performs the rounding.
///
/// Asserts that the result type is f64.
/// \returns a select that yields Src unchanged when |Src| exceeds
///          0x1.fffffffffffffp+51, and the add/sub result otherwise.
1787 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1788   SDLoc SL(Op);
1789   SDValue Src = Op.getOperand(0);
1790
1791   assert(Op.getValueType() == MVT::f64);
1792
       // C1 = 2^52, given the sign of Src.
1793   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1794   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1795   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1796
1797   // TODO: Should this propagate fast-math-flags?
1798
       // Tmp2 = (Src + CopySign) - CopySign
1799   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1800   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1801
1802   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1803
       // C2 is the threshold above which the source is passed through as is.
1804   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1805   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1806
1807   EVT SetCCVT =
1808       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1809   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1810
1811   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1812 }
1813
/// Lower FNEARBYINT by re-emitting the operation as an FRINT node with the
/// same value type and operand.
1814 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1815   // FNEARBYINT and FRINT are the same, except in their handling of FP
1816   // exceptions. Those aren't really meaningful for us, and OpenCL only has
1817   // rint, so just treat them as equivalent.
1818   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1819 }
1820
1821 // XXX - May require not supporting f32 denormals?
1822
1823 // Don't handle v2f16. The extra instructions to scalarize and repack around the
1824 // compare and vselect end up producing worse code than scalarizing the whole
1825 // operation.
     //
     // Expands FROUND as trunc(X) plus copysign(1.0, X) when the discarded
     // fraction has magnitude of at least 0.5. All nodes use the operation's own
     // value type; LowerFROUND calls this for f32 and f16.
1826 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
1827   SDLoc SL(Op);
1828   SDValue X = Op.getOperand(0);
1829   EVT VT = Op.getValueType();
1830
1831   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
1832
1833   // TODO: Should this propagate fast-math-flags?
1834
       // Diff is the fraction removed by the truncation.
1835   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
1836
1837   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
1838
1839   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
1840   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
1841   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
1842
       // SignOne = copysign(1.0, X): the step away from zero.
1843   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
1844
1845   EVT SetCCVT =
1846       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1847
1848   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1849
1850   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
1851
1852   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
1853 }
1854
/// Lower an f64 FROUND with integer bit manipulation on the 64-bit pattern:
/// add the bit at the 0.5 position when any fraction bit is set, then clear
/// the fraction bits. Exponents below 0 and above 51 are handled by separate
/// selects at the end.
///
/// \returns the rounded f64 value.
1855 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1856   SDLoc SL(Op);
1857   SDValue X = Op.getOperand(0);
1858
1859   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1860
1861   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1862   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1863   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1864   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1865   EVT SetCCVT =
1866       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1867
1868   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1869
       // The high word (element 1) holds the sign and exponent.
1870   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1871
1872   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1873
1874   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1875                                        MVT::i64);
1876
       // M: the 52-bit fraction mask shifted right by Exp, i.e. the bits that
       //    are still fractional for this exponent.
       // D: bit 51 shifted right by Exp, i.e. the highest of those bits.
1877   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1878   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1879                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
1880                                           MVT::i64),
1881                           Exp);
1882
       // Tmp1 is true when any fractional bit of the source is set.
1883   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1884   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1885                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
1886                               ISD::SETNE);
1887
       // K = (L + (fraction != 0 ? D : 0)) & ~M
1888   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1889                              D, DAG.getConstant(0, SL, MVT::i64));
1890   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1891
1892   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1893   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1894
1895   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1896   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1897   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1898
       // For Exp < 0 the result magnitude is 1.0 when Exp == -1 and 0.0
       // otherwise, with the sign copied from X.
1899   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1900                             ExpEqNegOne,
1901                             DAG.getConstantFP(1.0, SL, MVT::f64),
1902                             DAG.getConstantFP(0.0, SL, MVT::f64));
1903
1904   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1905
       // Exp < 0 selects S; Exp > 51 returns X unchanged.
1906   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1907   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1908
1909   return K;
1910 }
1911
/// Lower FROUND by dispatching on the result type: f32 and f16 go to
/// LowerFROUND32_16, f64 goes to LowerFROUND64. Any other type hits
/// llvm_unreachable.
1912 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1913   EVT VT = Op.getValueType();
1914
1915   if (VT == MVT::f32 || VT == MVT::f16)
1916     return LowerFROUND32_16(Op, DAG);
1917
1918   if (VT == MVT::f64)
1919     return LowerFROUND64(Op, DAG);
1920
1921   llvm_unreachable("unhandled type");
1922 }
1923
/// Lower an f64 FFLOOR in terms of FTRUNC: truncate toward zero, then add
/// -1.0 when the source is negative and was not already an integer.
///
/// \returns the f64 FADD node producing the rounded-down value.
1924 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1925   SDLoc SL(Op);
1926   SDValue Src = Op.getOperand(0);
1927
1928   // result = trunc(src);
1929   // if (src < 0.0 && src != result)
1930   //   result += -1.0.
1931
1932   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1933
1934   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1935   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1936
1937   EVT SetCCVT =
1938       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1939
       // Ordered compares: both are false for NaN, so NaN inputs get +0.0 added.
1940   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1941   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1942   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1943
       // Adjustment is -1.0 when both conditions hold, otherwise 0.0.
1944   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1945   // TODO: Should this propagate fast-math-flags?
1946   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1947 }
1948
1949 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1950   SDLoc SL(Op);
1951   SDValue Src = Op.getOperand(0);
1952   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1953
1954   if (ZeroUndef && Src.getValueType() == MVT::i32)
1955     return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1956
1957   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1958
1959   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1960   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1961
1962   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1963   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1964
1965   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1966                                    *DAG.getContext(), MVT::i32);
1967
1968   SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1969
1970   SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1971   SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1972
1973   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1974   SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1975
1976   // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
1977   SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
1978
1979   if (!ZeroUndef) {
1980     // Test if the full 64-bit input is zero.
1981
1982     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
1983     // which we probably don't want.
1984     SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
1985     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
1986
1987     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
1988     // with the same cycles, otherwise it is slower.
1989     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
1990     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
1991
1992     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
1993
1994     // The instruction returns -1 for 0 input, but the defined intrinsic
1995     // behavior is to return the number of bits.
1996     NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
1997                           SrcIsZero, Bits32, NewCtlz);
1998   }
1999
2000   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
2001 }
2002
/// Lower a 64-bit integer to f32 conversion by assembling the f32 bit pattern
/// with integer operations, following the pseudo-code below.
///
/// \param Signed  when true the source is first reduced to its magnitude and
///                the result is negated for negative inputs.
/// \returns the f32 result.
2003 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2004                                                bool Signed) const {
2005   // Unsigned
2006   // cul2f(ulong u)
2007   //{
2008   //  uint lz = clz(u);
2009   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2010   //  u = (u << lz) & 0x7fffffffffffffffUL;
2011   //  ulong t = u & 0xffffffffffUL;
2012   //  uint v = (e << 23) | (uint)(u >> 40);
2013   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2014   //  return as_float(v + r);
2015   //}
2016   // Signed
2017   // cl2f(long l)
2018   //{
2019   //  long s = l >> 63;
2020   //  float r = cul2f((l + s) ^ s);
2021   //  return s ? -r : r;
2022   //}
2023
2024   SDLoc SL(Op);
2025   SDValue Src = Op.getOperand(0);
2026   SDValue L = Src;
2027
       // S is the sign mask (arithmetic shift by 63); L becomes (L + S) ^ S.
2028   SDValue S;
2029   if (Signed) {
2030     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2031     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2032
2033     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2034     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2035   }
2036
2037   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2038                                    *DAG.getContext(), MVT::f32);
2039
2040
2041   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2042   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2043   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2044   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2045
       // E = (L != 0) ? 127 + 63 - LZ : 0
2046   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2047   SDValue E = DAG.getSelect(SL, MVT::i32,
2048     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2049     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2050     ZeroI32);
2051
       // U = (L << LZ) with the top bit cleared.
2052   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2053     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2054     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2055
       // T = low 40 bits of U, used for the rounding decision below.
2056   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2057                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2058
       // NOTE: despite its name, UShl is a logical right shift of U by 40.
2059   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2060                              U, DAG.getConstant(40, SL, MVT::i64));
2061
       // V = (E << 23) | (uint)(U >> 40)
2062   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2063     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2064     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2065
2066   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2067   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2068   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2069
2070   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2071
2072   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2073
       // R = T > C ? 1 : (T == C ? V & 1 : 0), then R = as_float(V + R).
2074   SDValue R = DAG.getSelect(SL, MVT::i32,
2075     RCmp,
2076     One,
2077     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2078   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2079   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2080
2081   if (!Signed)
2082     return R;
2083
       // The sign mask S, converted to the setcc type, picks -R or R.
2084   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2085   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2086 }
2087
/// Lower a 64-bit integer to f64 conversion by converting the two 32-bit
/// halves separately: result = ldexp(cvt(Hi), 32) + uint_to_fp(Lo).
///
/// \param Signed  selects SINT_TO_FP instead of UINT_TO_FP for the high half;
///                the low half is always converted as unsigned.
/// \returns the f64 FADD node combining both halves.
2088 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2089                                                bool Signed) const {
2090   SDLoc SL(Op);
2091   SDValue Src = Op.getOperand(0);
2092
2093   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2094
       // Element 0 is used as the low half, element 1 as the high half.
2095   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2096                            DAG.getConstant(0, SL, MVT::i32));
2097   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2098                            DAG.getConstant(1, SL, MVT::i32));
2099
2100   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2101                               SL, MVT::f64, Hi);
2102
2103   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2104
       // LDEXP with 32 gives the converted high half its weight.
2105   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2106                               DAG.getConstant(32, SL, MVT::i32));
2107   // TODO: Should this propagate fast-math-flags?
2108   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2109 }
2110
/// Lower UINT_TO_FP from an i64 source (asserted).
///
/// With 16-bit instructions and an f16 destination, the conversion is
/// re-emitted with an f32 result and then FP_ROUNDed to f16. Otherwise f32
/// goes to LowerINT_TO_FP32 and f64 (asserted) to LowerINT_TO_FP64, both in
/// unsigned mode.
2111 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2112                                                SelectionDAG &DAG) const {
2113   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2114          "operation should be legal");
2115
2116   // TODO: Factor out code common with LowerSINT_TO_FP.
2117
2118   EVT DestVT = Op.getValueType();
2119   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2120     SDLoc DL(Op);
2121     SDValue Src = Op.getOperand(0);
2122
         // Same opcode as Op, but producing f32.
2123     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2124     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2125     SDValue FPRound =
2126         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2127
2128     return FPRound;
2129   }
2130
2131   if (DestVT == MVT::f32)
2132     return LowerINT_TO_FP32(Op, DAG, false);
2133
2134   assert(DestVT == MVT::f64);
2135   return LowerINT_TO_FP64(Op, DAG, false);
2136 }
2137
/// Lower SINT_TO_FP from an i64 source (asserted).
///
/// With 16-bit instructions and an f16 destination, the conversion is
/// re-emitted with an f32 result and then FP_ROUNDed to f16. Otherwise f32
/// goes to LowerINT_TO_FP32 and f64 (asserted) to LowerINT_TO_FP64, both in
/// signed mode.
2138 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2139                                               SelectionDAG &DAG) const {
2140   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2141          "operation should be legal");
2142
2143   // TODO: Factor out code common with LowerUINT_TO_FP.
2144
2145   EVT DestVT = Op.getValueType();
2146   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2147     SDLoc DL(Op);
2148     SDValue Src = Op.getOperand(0);
2149
         // Same opcode as Op, but producing f32.
2150     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2151     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2152     SDValue FPRound =
2153         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2154
2155     return FPRound;
2156   }
2157
2158   if (DestVT == MVT::f32)
2159     return LowerINT_TO_FP32(Op, DAG, true);
2160
2161   assert(DestVT == MVT::f64);
2162   return LowerINT_TO_FP64(Op, DAG, true);
2163 }
2164
/// Lower an f64 to 64-bit integer conversion by producing the two 32-bit
/// halves separately and packing them as {Lo, Hi}.
///
/// \param Signed  selects FP_TO_SINT instead of FP_TO_UINT for the high half;
///                the low half is always converted as unsigned.
/// \returns the v2i32 {Lo, Hi} pair bitcast to i64.
2165 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2166                                                bool Signed) const {
2167   SDLoc SL(Op);
2168
2169   SDValue Src = Op.getOperand(0);
2170
2171   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2172
       // K0 is the f64 bit pattern for 2^-32 and K1 the pattern for -(2^32).
2173   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2174                                  MVT::f64);
2175   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2176                                  MVT::f64);
2177   // TODO: Should this propagate fast-math-flags?
2178   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2179
       // FloorMul = floor(Trunc * 2^-32): the value of the high 32 bits.
2180   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2181
2182
       // Fma = FloorMul * -(2^32) + Trunc: what remains for the low 32 bits.
2183   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2184
2185   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2186                            MVT::i32, FloorMul);
2187   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2188
2189   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2190
2191   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2192 }
2193
/// Lower FP_TO_FP16.
///
/// An f32 source becomes the target node AMDGPUISD::FP_TO_FP16. With
/// UnsafeFPMath an empty SDValue is returned so the generic expansion is
/// used. Otherwise the source must be f64 (asserted) and the f16 bit pattern
/// is assembled with i32 integer operations.
///
/// \returns the f16 bits zero-extended or truncated to the result type, or
///          an empty SDValue on the UnsafeFPMath path.
2194 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2195   SDLoc DL(Op);
2196   SDValue N0 = Op.getOperand(0);
2197
2198   // Convert to target node to get known bits
2199   if (N0.getValueType() == MVT::f32)
2200     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2201
2202   if (getTargetMachine().Options.UnsafeFPMath) {
2203     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2204     return SDValue();
2205   }
2206
2207   assert(N0.getSimpleValueType() == MVT::f64);
2208
2209   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2210   const unsigned ExpMask = 0x7ff;
2211   const unsigned ExpBiasf64 = 1023;
2212   const unsigned ExpBiasf16 = 15;
2213   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2214   SDValue One = DAG.getConstant(1, DL, MVT::i32);
       // UH = high 32 bits of the f64 pattern, U = low 32 bits.
2215   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2216   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2217                            DAG.getConstant(32, DL, MVT::i64));
2218   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2219   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
       // NOTE(review): this shift amount is an i64 constant although the
       // shifted value is i32 and every other shift here uses an i32 amount -
       // confirm this is intended.
2220   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2221                           DAG.getConstant(20, DL, MVT::i64));
2222   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2223                   DAG.getConstant(ExpMask, DL, MVT::i32));
2224   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2225   // add the f16 bias (15) to get the biased exponent for the f16 format.
2226   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2227                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2228
       // M = (UH >> 8) & 0xffe; bit 0 is filled in below.
2229   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2230                           DAG.getConstant(8, DL, MVT::i32));
2231   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2232                   DAG.getConstant(0xffe, DL, MVT::i32));
2233
       // Bit 0 of M is set when any of the remaining lower bits (UH & 0x1ff,
       // or the low word U) is nonzero.
2234   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2235                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2236   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2237
2238   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2239   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2240
2241   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2242   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2243       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2244                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2245
2246   // N = M | (E << 12);
2247   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2248       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2249                   DAG.getConstant(12, DL, MVT::i32)));
2250
2251   // B = clamp(1-E, 0, 13);
2252   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2253                                   One, E);
2254   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2255   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2256                   DAG.getConstant(13, DL, MVT::i32));
2257
2258   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2259                                    DAG.getConstant(0x1000, DL, MVT::i32));
2260
       // D = SigSetHigh >> B, with bit 0 set when the shift dropped nonzero bits.
2261   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2262   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2263   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2264   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2265
       // V = D when E < 1, otherwise N.
2266   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2267   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2268                               DAG.getConstant(0x7, DL, MVT::i32));
2269   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2270                   DAG.getConstant(2, DL, MVT::i32));
       // Rounding: add 1 when the low three bits were 3 or greater than 5.
2271   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2272                                One, Zero, ISD::SETEQ);
2273   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2274                                One, Zero, ISD::SETGT);
2275   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2276   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2277
       // E > 30 yields 0x7c00. E == 1039 (0x7ff - 1023 + 15, i.e. the source
       // exponent field was all ones) yields I.
2278   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2279                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2280   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2281                       I, V, ISD::SETEQ);
2282
2283   // Extract the sign bit.
2284   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2285                             DAG.getConstant(16, DL, MVT::i32));
2286   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2287                      DAG.getConstant(0x8000, DL, MVT::i32));
2288
2289   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2290   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2291 }
2292
2293 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2294                                               SelectionDAG &DAG) const {
2295   SDValue Src = Op.getOperand(0);
2296
2297   // TODO: Factor out code common with LowerFP_TO_UINT.
2298
2299   EVT SrcVT = Src.getValueType();
2300   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2301     SDLoc DL(Op);
2302
2303     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2304     SDValue FpToInt32 =
2305         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2306
2307     return FpToInt32;
2308   }
2309
2310   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2311     return LowerFP64_TO_INT(Op, DAG, true);
2312
2313   return SDValue();
2314 }
2315
2316 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2317                                               SelectionDAG &DAG) const {
2318   SDValue Src = Op.getOperand(0);
2319
2320   // TODO: Factor out code common with LowerFP_TO_SINT.
2321
2322   EVT SrcVT = Src.getValueType();
2323   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2324     SDLoc DL(Op);
2325
2326     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2327     SDValue FpToInt32 =
2328         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2329
2330     return FpToInt32;
2331   }
2332
2333   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2334     return LowerFP64_TO_INT(Op, DAG, false);
2335
2336   return SDValue();
2337 }
2338
2339 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2340                                                      SelectionDAG &DAG) const {
2341   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2342   MVT VT = Op.getSimpleValueType();
2343   MVT ScalarVT = VT.getScalarType();
2344
2345   assert(VT.isVector());
2346
2347   SDValue Src = Op.getOperand(0);
2348   SDLoc DL(Op);
2349
2350   // TODO: Don't scalarize on Evergreen?
2351   unsigned NElts = VT.getVectorNumElements();
2352   SmallVector<SDValue, 8> Args;
2353   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2354
2355   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2356   for (unsigned I = 0; I < NElts; ++I)
2357     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2358
2359   return DAG.getBuildVector(VT, DL, Args);
2360 }
2361
2362 //===----------------------------------------------------------------------===//
2363 // Custom DAG optimizations
2364 //===----------------------------------------------------------------------===//
2365
2366 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2367   KnownBits Known;
2368   EVT VT = Op.getValueType();
2369   DAG.computeKnownBits(Op, Known);
2370
2371   return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
2372 }
2373
2374 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2375   EVT VT = Op.getValueType();
2376
2377   // In order for this to be a signed 24-bit value, bit 23, must
2378   // be a sign bit.
2379   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2380                                      // as unsigned 24-bit values.
2381          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2382 }
2383
2384 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2385                         TargetLowering::DAGCombinerInfo &DCI) {
2386
2387   SelectionDAG &DAG = DCI.DAG;
2388   SDValue Op = Node24->getOperand(OpIdx);
2389   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2390   EVT VT = Op.getValueType();
2391
2392   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2393   APInt KnownZero, KnownOne;
2394   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2395   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
2396     return true;
2397
2398   return false;
2399 }
2400
2401 template <typename IntTy>
2402 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2403                                uint32_t Width, const SDLoc &DL) {
2404   if (Width + Offset < 32) {
2405     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2406     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2407     return DAG.getConstant(Result, DL, MVT::i32);
2408   }
2409
2410   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2411 }
2412
2413 static bool hasVolatileUser(SDNode *Val) {
2414   for (SDNode *U : Val->uses()) {
2415     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2416       if (M->isVolatile())
2417         return true;
2418     }
2419   }
2420
2421   return false;
2422 }
2423
2424 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2425   // i32 vectors are the canonical memory type.
2426   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2427     return false;
2428
2429   if (!VT.isByteSized())
2430     return false;
2431
2432   unsigned Size = VT.getStoreSize();
2433
2434   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2435     return false;
2436
2437   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2438     return false;
2439
2440   return true;
2441 }
2442
2443 // Replace load of an illegal type with a store of a bitcast to a friendlier
2444 // type.
2445 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2446                                                  DAGCombinerInfo &DCI) const {
2447   if (!DCI.isBeforeLegalize())
2448     return SDValue();
2449
2450   LoadSDNode *LN = cast<LoadSDNode>(N);
2451   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2452     return SDValue();
2453
2454   SDLoc SL(N);
2455   SelectionDAG &DAG = DCI.DAG;
2456   EVT VT = LN->getMemoryVT();
2457
2458   unsigned Size = VT.getStoreSize();
2459   unsigned Align = LN->getAlignment();
2460   if (Align < Size && isTypeLegal(VT)) {
2461     bool IsFast;
2462     unsigned AS = LN->getAddressSpace();
2463
2464     // Expand unaligned loads earlier than legalization. Due to visitation order
2465     // problems during legalization, the emitted instructions to pack and unpack
2466     // the bytes again are not eliminated in the case of an unaligned copy.
2467     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2468       if (VT.isVector())
2469         return scalarizeVectorLoad(LN, DAG);
2470
2471       SDValue Ops[2];
2472       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2473       return DAG.getMergeValues(Ops, SDLoc(N));
2474     }
2475
2476     if (!IsFast)
2477       return SDValue();
2478   }
2479
2480   if (!shouldCombineMemoryType(VT))
2481     return SDValue();
2482
2483   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2484
2485   SDValue NewLoad
2486     = DAG.getLoad(NewVT, SL, LN->getChain(),
2487                   LN->getBasePtr(), LN->getMemOperand());
2488
2489   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2490   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2491   return SDValue(N, 0);
2492 }
2493
2494 // Replace store of an illegal type with a store of a bitcast to a friendlier
2495 // type.
2496 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2497                                                   DAGCombinerInfo &DCI) const {
2498   if (!DCI.isBeforeLegalize())
2499     return SDValue();
2500
2501   StoreSDNode *SN = cast<StoreSDNode>(N);
2502   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2503     return SDValue();
2504
2505   EVT VT = SN->getMemoryVT();
2506   unsigned Size = VT.getStoreSize();
2507
2508   SDLoc SL(N);
2509   SelectionDAG &DAG = DCI.DAG;
2510   unsigned Align = SN->getAlignment();
2511   if (Align < Size && isTypeLegal(VT)) {
2512     bool IsFast;
2513     unsigned AS = SN->getAddressSpace();
2514
2515     // Expand unaligned stores earlier than legalization. Due to visitation
2516     // order problems during legalization, the emitted instructions to pack and
2517     // unpack the bytes again are not eliminated in the case of an unaligned
2518     // copy.
2519     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2520       if (VT.isVector())
2521         return scalarizeVectorStore(SN, DAG);
2522
2523       return expandUnalignedStore(SN, DAG);
2524     }
2525
2526     if (!IsFast)
2527       return SDValue();
2528   }
2529
2530   if (!shouldCombineMemoryType(VT))
2531     return SDValue();
2532
2533   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2534   SDValue Val = SN->getValue();
2535
2536   //DCI.AddToWorklist(Val.getNode());
2537
2538   bool OtherUses = !Val.hasOneUse();
2539   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2540   if (OtherUses) {
2541     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2542     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2543   }
2544
2545   return DAG.getStore(SN->getChain(), SL, CastVal,
2546                       SN->getBasePtr(), SN->getMemOperand());
2547 }
2548
2549 SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2550                                                   DAGCombinerInfo &DCI) const {
2551   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2552   if (!CSrc)
2553     return SDValue();
2554
2555   const APFloat &F = CSrc->getValueAPF();
2556   APFloat Zero = APFloat::getZero(F.getSemantics());
2557   APFloat::cmpResult Cmp0 = F.compare(Zero);
2558   if (Cmp0 == APFloat::cmpLessThan ||
2559       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2560     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2561   }
2562
2563   APFloat One(F.getSemantics(), "1.0");
2564   APFloat::cmpResult Cmp1 = F.compare(One);
2565   if (Cmp1 == APFloat::cmpGreaterThan)
2566     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2567
2568   return SDValue(CSrc, 0);
2569 }
2570
2571 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2572 /// binary operation \p Opc to it with the corresponding constant operands.
2573 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2574   DAGCombinerInfo &DCI, const SDLoc &SL,
2575   unsigned Opc, SDValue LHS,
2576   uint32_t ValLo, uint32_t ValHi) const {
2577   SelectionDAG &DAG = DCI.DAG;
2578   SDValue Lo, Hi;
2579   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2580
2581   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2582   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2583
2584   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2585   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2586
2587   // Re-visit the ands. It's possible we eliminated one of them and it could
2588   // simplify the vector.
2589   DCI.AddToWorklist(Lo.getNode());
2590   DCI.AddToWorklist(Hi.getNode());
2591
2592   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2593   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2594 }
2595
2596 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2597                                                 DAGCombinerInfo &DCI) const {
2598   EVT VT = N->getValueType(0);
2599   if (VT != MVT::i64)
2600     return SDValue();
2601
2602   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2603   if (!RHS)
2604     return SDValue();
2605
2606   SDValue LHS = N->getOperand(0);
2607   unsigned RHSVal = RHS->getZExtValue();
2608   if (!RHSVal)
2609     return LHS;
2610
2611   SDLoc SL(N);
2612   SelectionDAG &DAG = DCI.DAG;
2613
2614   switch (LHS->getOpcode()) {
2615   default:
2616     break;
2617   case ISD::ZERO_EXTEND:
2618   case ISD::SIGN_EXTEND:
2619   case ISD::ANY_EXTEND: {
2620     // shl (ext x) => zext (shl x), if shift does not overflow int
2621     KnownBits Known;
2622     SDValue X = LHS->getOperand(0);
2623     DAG.computeKnownBits(X, Known);
2624     unsigned LZ = Known.countMinLeadingZeros();
2625     if (LZ < RHSVal)
2626       break;
2627     EVT XVT = X.getValueType();
2628     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2629     return DAG.getZExtOrTrunc(Shl, SL, VT);
2630   }
2631   }
2632
2633   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2634
2635   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2636   // common case, splitting this into a move and a 32-bit shift is faster and
2637   // the same code size.
2638   if (RHSVal < 32)
2639     return SDValue();
2640
2641   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2642
2643   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2644   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2645
2646   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2647
2648   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2649   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2650 }
2651
2652 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2653                                                 DAGCombinerInfo &DCI) const {
2654   if (N->getValueType(0) != MVT::i64)
2655     return SDValue();
2656
2657   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2658   if (!RHS)
2659     return SDValue();
2660
2661   SelectionDAG &DAG = DCI.DAG;
2662   SDLoc SL(N);
2663   unsigned RHSVal = RHS->getZExtValue();
2664
2665   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2666   if (RHSVal == 32) {
2667     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2668     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2669                                    DAG.getConstant(31, SL, MVT::i32));
2670
2671     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2672     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2673   }
2674
2675   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2676   if (RHSVal == 63) {
2677     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2678     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2679                                    DAG.getConstant(31, SL, MVT::i32));
2680     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2681     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2682   }
2683
2684   return SDValue();
2685 }
2686
2687 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2688                                                 DAGCombinerInfo &DCI) const {
2689   if (N->getValueType(0) != MVT::i64)
2690     return SDValue();
2691
2692   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2693   if (!RHS)
2694     return SDValue();
2695
2696   unsigned ShiftAmt = RHS->getZExtValue();
2697   if (ShiftAmt < 32)
2698     return SDValue();
2699
2700   // srl i64:x, C for C >= 32
2701   // =>
2702   //   build_pair (srl hi_32(x), C - 32), 0
2703
2704   SelectionDAG &DAG = DCI.DAG;
2705   SDLoc SL(N);
2706
2707   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2708   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2709
2710   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2711   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2712                            VecOp, One);
2713
2714   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2715   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2716
2717   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2718
2719   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2720 }
2721
2722 // We need to specifically handle i64 mul here to avoid unnecessary conversion
2723 // instructions. If we only match on the legalized i64 mul expansion,
2724 // SimplifyDemandedBits will be unable to remove them because there will be
2725 // multiple uses due to the separate mul + mulh[su].
2726 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2727                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2728   if (Size <= 32) {
2729     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2730     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2731   }
2732
2733   // Because we want to eliminate extension instructions before the
2734   // operation, we need to create a single user here (i.e. not the separate
2735   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2736
2737   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2738
2739   SDValue Mul = DAG.getNode(MulOpc, SL,
2740                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2741
2742   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2743                      Mul.getValue(0), Mul.getValue(1));
2744 }
2745
2746 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2747                                                 DAGCombinerInfo &DCI) const {
2748   EVT VT = N->getValueType(0);
2749
2750   unsigned Size = VT.getSizeInBits();
2751   if (VT.isVector() || Size > 64)
2752     return SDValue();
2753
2754   // There are i16 integer mul/mad.
2755   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2756     return SDValue();
2757
2758   SelectionDAG &DAG = DCI.DAG;
2759   SDLoc DL(N);
2760
2761   SDValue N0 = N->getOperand(0);
2762   SDValue N1 = N->getOperand(1);
2763   SDValue Mul;
2764
2765   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2766     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2767     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2768     Mul = getMul24(DAG, DL, N0, N1, Size, false);
2769   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2770     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2771     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2772     Mul = getMul24(DAG, DL, N0, N1, Size, true);
2773   } else {
2774     return SDValue();
2775   }
2776
2777   // We need to use sext even for MUL_U24, because MUL_U24 is used
2778   // for signed multiply of 8 and 16-bit types.
2779   return DAG.getSExtOrTrunc(Mul, DL, VT);
2780 }
2781
2782 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2783                                                   DAGCombinerInfo &DCI) const {
2784   EVT VT = N->getValueType(0);
2785
2786   if (!Subtarget->hasMulI24() || VT.isVector())
2787     return SDValue();
2788
2789   SelectionDAG &DAG = DCI.DAG;
2790   SDLoc DL(N);
2791
2792   SDValue N0 = N->getOperand(0);
2793   SDValue N1 = N->getOperand(1);
2794
2795   if (!isI24(N0, DAG) || !isI24(N1, DAG))
2796     return SDValue();
2797
2798   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2799   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2800
2801   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2802   DCI.AddToWorklist(Mulhi.getNode());
2803   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2804 }
2805
2806 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2807                                                   DAGCombinerInfo &DCI) const {
2808   EVT VT = N->getValueType(0);
2809
2810   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2811     return SDValue();
2812
2813   SelectionDAG &DAG = DCI.DAG;
2814   SDLoc DL(N);
2815
2816   SDValue N0 = N->getOperand(0);
2817   SDValue N1 = N->getOperand(1);
2818
2819   if (!isU24(N0, DAG) || !isU24(N1, DAG))
2820     return SDValue();
2821
2822   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2823   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2824
2825   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2826   DCI.AddToWorklist(Mulhi.getNode());
2827   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2828 }
2829
2830 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2831   SDNode *N, DAGCombinerInfo &DCI) const {
2832   SelectionDAG &DAG = DCI.DAG;
2833
2834   // Simplify demanded bits before splitting into multiple users.
2835   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2836     return SDValue();
2837
2838   SDValue N0 = N->getOperand(0);
2839   SDValue N1 = N->getOperand(1);
2840
2841   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2842
2843   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2844   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2845
2846   SDLoc SL(N);
2847
2848   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2849   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2850   return DAG.getMergeValues({ MulLo, MulHi }, SL);
2851 }
2852
2853 static bool isNegativeOne(SDValue Val) {
2854   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2855     return C->isAllOnesValue();
2856   return false;
2857 }
2858
2859 static bool isCtlzOpc(unsigned Opc) {
2860   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2861 }
2862
2863 SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2864                                           SDValue Op,
2865                                           const SDLoc &DL) const {
2866   EVT VT = Op.getValueType();
2867   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2868   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2869                               LegalVT != MVT::i16))
2870     return SDValue();
2871
2872   if (VT != MVT::i32)
2873     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2874
2875   SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2876   if (VT != MVT::i32)
2877     FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
2878
2879   return FFBH;
2880 }
2881
2882 // The native instructions return -1 on 0 input. Optimize out a select that
2883 // produces -1 on 0.
2884 //
2885 // TODO: If zero is not undef, we could also do this if the output is compared
2886 // against the bitwidth.
2887 //
2888 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2889 SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2890                                                  SDValue LHS, SDValue RHS,
2891                                                  DAGCombinerInfo &DCI) const {
2892   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2893   if (!CmpRhs || !CmpRhs->isNullValue())
2894     return SDValue();
2895
2896   SelectionDAG &DAG = DCI.DAG;
2897   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2898   SDValue CmpLHS = Cond.getOperand(0);
2899
2900   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2901   if (CCOpcode == ISD::SETEQ &&
2902       isCtlzOpc(RHS.getOpcode()) &&
2903       RHS.getOperand(0) == CmpLHS &&
2904       isNegativeOne(LHS)) {
2905     return getFFBH_U32(DAG, CmpLHS, SL);
2906   }
2907
2908   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2909   if (CCOpcode == ISD::SETNE &&
2910       isCtlzOpc(LHS.getOpcode()) &&
2911       LHS.getOperand(0) == CmpLHS &&
2912       isNegativeOne(RHS)) {
2913     return getFFBH_U32(DAG, CmpLHS, SL);
2914   }
2915
2916   return SDValue();
2917 }
2918
2919 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
2920                                          unsigned Op,
2921                                          const SDLoc &SL,
2922                                          SDValue Cond,
2923                                          SDValue N1,
2924                                          SDValue N2) {
2925   SelectionDAG &DAG = DCI.DAG;
2926   EVT VT = N1.getValueType();
2927
2928   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
2929                                   N1.getOperand(0), N2.getOperand(0));
2930   DCI.AddToWorklist(NewSelect.getNode());
2931   return DAG.getNode(Op, SL, VT, NewSelect);
2932 }
2933
2934 // Pull a free FP operation out of a select so it may fold into uses.
2935 //
2936 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
2937 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
2938 //
2939 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
2940 // select c, (fabs x), +k -> fabs (select c, x, k)
2941 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
2942                                     SDValue N) {
2943   SelectionDAG &DAG = DCI.DAG;
2944   SDValue Cond = N.getOperand(0);
2945   SDValue LHS = N.getOperand(1);
2946   SDValue RHS = N.getOperand(2);
2947
2948   EVT VT = N.getValueType();
2949   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
2950       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
2951     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
2952                                      SDLoc(N), Cond, LHS, RHS);
2953   }
2954
2955   bool Inv = false;
2956   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
2957     std::swap(LHS, RHS);
2958     Inv = true;
2959   }
2960
2961   // TODO: Support vector constants.
2962   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2963   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
2964     SDLoc SL(N);
2965     // If one side is an fneg/fabs and the other is a constant, we can push the
2966     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
2967     SDValue NewLHS = LHS.getOperand(0);
2968     SDValue NewRHS = RHS;
2969
2970     // Careful: if the neg can be folded up, don't try to pull it back down.
2971     bool ShouldFoldNeg = true;
2972
2973     if (NewLHS.hasOneUse()) {
2974       unsigned Opc = NewLHS.getOpcode();
2975       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
2976         ShouldFoldNeg = false;
2977       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
2978         ShouldFoldNeg = false;
2979     }
2980
2981     if (ShouldFoldNeg) {
2982       if (LHS.getOpcode() == ISD::FNEG)
2983         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2984       else if (CRHS->isNegative())
2985         return SDValue();
2986
2987       if (Inv)
2988         std::swap(NewLHS, NewRHS);
2989
2990       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
2991                                       Cond, NewLHS, NewRHS);
2992       DCI.AddToWorklist(NewSelect.getNode());
2993       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
2994     }
2995   }
2996
2997   return SDValue();
2998 }
2999
3000
3001 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3002                                                    DAGCombinerInfo &DCI) const {
3003   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3004     return Folded;
3005
3006   SDValue Cond = N->getOperand(0);
3007   if (Cond.getOpcode() != ISD::SETCC)
3008     return SDValue();
3009
3010   EVT VT = N->getValueType(0);
3011   SDValue LHS = Cond.getOperand(0);
3012   SDValue RHS = Cond.getOperand(1);
3013   SDValue CC = Cond.getOperand(2);
3014
3015   SDValue True = N->getOperand(1);
3016   SDValue False = N->getOperand(2);
3017
3018   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3019     SelectionDAG &DAG = DCI.DAG;
3020     if ((DAG.isConstantValueOfAnyType(True) ||
3021          DAG.isConstantValueOfAnyType(True)) &&
3022         (!DAG.isConstantValueOfAnyType(False) &&
3023          !DAG.isConstantValueOfAnyType(False))) {
3024       // Swap cmp + select pair to move constant to false input.
3025       // This will allow using VOPC cndmasks more often.
3026       // select (setcc x, y), k, x -> select (setcc y, x) x, x
3027
3028       SDLoc SL(N);
3029       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3030                                             LHS.getValueType().isInteger());
3031
3032       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3033       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3034     }
3035
3036     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3037       SDValue MinMax
3038         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3039       // Revisit this node so we can catch min3/max3/med3 patterns.
3040       //DCI.AddToWorklist(MinMax.getNode());
3041       return MinMax;
3042     }
3043   }
3044
3045   // There's no reason to not do this if the condition has other uses.
3046   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
3047 }
3048
3049 static bool isConstantFPZero(SDValue N) {
3050   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3051     return C->isZero() && !C->isNegative();
3052   return false;
3053 }
3054
3055 static unsigned inverseMinMax(unsigned Opc) {
3056   switch (Opc) {
3057   case ISD::FMAXNUM:
3058     return ISD::FMINNUM;
3059   case ISD::FMINNUM:
3060     return ISD::FMAXNUM;
3061   case AMDGPUISD::FMAX_LEGACY:
3062     return AMDGPUISD::FMIN_LEGACY;
3063   case AMDGPUISD::FMIN_LEGACY:
3064     return  AMDGPUISD::FMAX_LEGACY;
3065   default:
3066     llvm_unreachable("invalid min/max opcode");
3067   }
3068 }
3069
3070 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3071                                                  DAGCombinerInfo &DCI) const {
3072   SelectionDAG &DAG = DCI.DAG;
3073   SDValue N0 = N->getOperand(0);
3074   EVT VT = N->getValueType(0);
3075
3076   unsigned Opc = N0.getOpcode();
3077
3078   // If the input has multiple uses and we can either fold the negate down, or
3079   // the other uses cannot, give up. This both prevents unprofitable
3080   // transformations and infinite loops: we won't repeatedly try to fold around
3081   // a negate that has no 'good' form.
3082   if (N0.hasOneUse()) {
3083     // This may be able to fold into the source, but at a code size cost. Don't
3084     // fold if the fold into the user is free.
3085     if (allUsesHaveSourceMods(N, 0))
3086       return SDValue();
3087   } else {
3088     if (fnegFoldsIntoOp(Opc) &&
3089         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3090       return SDValue();
3091   }
3092
3093   SDLoc SL(N);
3094   switch (Opc) {
3095   case ISD::FADD: {
3096     if (!mayIgnoreSignedZero(N0))
3097       return SDValue();
3098
3099     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3100     SDValue LHS = N0.getOperand(0);
3101     SDValue RHS = N0.getOperand(1);
3102
3103     if (LHS.getOpcode() != ISD::FNEG)
3104       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3105     else
3106       LHS = LHS.getOperand(0);
3107
3108     if (RHS.getOpcode() != ISD::FNEG)
3109       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3110     else
3111       RHS = RHS.getOperand(0);
3112
3113     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3114     if (!N0.hasOneUse())
3115       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3116     return Res;
3117   }
3118   case ISD::FMUL:
3119   case AMDGPUISD::FMUL_LEGACY: {
3120     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3121     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3122     SDValue LHS = N0.getOperand(0);
3123     SDValue RHS = N0.getOperand(1);
3124
3125     if (LHS.getOpcode() == ISD::FNEG)
3126       LHS = LHS.getOperand(0);
3127     else if (RHS.getOpcode() == ISD::FNEG)
3128       RHS = RHS.getOperand(0);
3129     else
3130       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3131
3132     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3133     if (!N0.hasOneUse())
3134       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3135     return Res;
3136   }
3137   case ISD::FMA:
3138   case ISD::FMAD: {
3139     if (!mayIgnoreSignedZero(N0))
3140       return SDValue();
3141
3142     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3143     SDValue LHS = N0.getOperand(0);
3144     SDValue MHS = N0.getOperand(1);
3145     SDValue RHS = N0.getOperand(2);
3146
3147     if (LHS.getOpcode() == ISD::FNEG)
3148       LHS = LHS.getOperand(0);
3149     else if (MHS.getOpcode() == ISD::FNEG)
3150       MHS = MHS.getOperand(0);
3151     else
3152       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3153
3154     if (RHS.getOpcode() != ISD::FNEG)
3155       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3156     else
3157       RHS = RHS.getOperand(0);
3158
3159     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3160     if (!N0.hasOneUse())
3161       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3162     return Res;
3163   }
3164   case ISD::FMAXNUM:
3165   case ISD::FMINNUM:
3166   case AMDGPUISD::FMAX_LEGACY:
3167   case AMDGPUISD::FMIN_LEGACY: {
3168     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3169     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3170     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3171     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3172
3173     SDValue LHS = N0.getOperand(0);
3174     SDValue RHS = N0.getOperand(1);
3175
3176     // 0 doesn't have a negated inline immediate.
3177     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3178     // operations.
3179     if (isConstantFPZero(RHS))
3180       return SDValue();
3181
3182     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3183     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3184     unsigned Opposite = inverseMinMax(Opc);
3185
3186     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3187     if (!N0.hasOneUse())
3188       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3189     return Res;
3190   }
3191   case ISD::FP_EXTEND:
3192   case ISD::FTRUNC:
3193   case ISD::FRINT:
3194   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3195   case ISD::FSIN:
3196   case AMDGPUISD::RCP:
3197   case AMDGPUISD::RCP_LEGACY:
3198   case AMDGPUISD::SIN_HW: {
3199     SDValue CvtSrc = N0.getOperand(0);
3200     if (CvtSrc.getOpcode() == ISD::FNEG) {
3201       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3202       // (fneg (rcp (fneg x))) -> (rcp x)
3203       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3204     }
3205
3206     if (!N0.hasOneUse())
3207       return SDValue();
3208
3209     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3210     // (fneg (rcp x)) -> (rcp (fneg x))
3211     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3212     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3213   }
3214   case ISD::FP_ROUND: {
3215     SDValue CvtSrc = N0.getOperand(0);
3216
3217     if (CvtSrc.getOpcode() == ISD::FNEG) {
3218       // (fneg (fp_round (fneg x))) -> (fp_round x)
3219       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3220                          CvtSrc.getOperand(0), N0.getOperand(1));
3221     }
3222
3223     if (!N0.hasOneUse())
3224       return SDValue();
3225
3226     // (fneg (fp_round x)) -> (fp_round (fneg x))
3227     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3228     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3229   }
3230   case ISD::FP16_TO_FP: {
3231     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3232     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3233     // Put the fneg back as a legal source operation that can be matched later.
3234     SDLoc SL(N);
3235
3236     SDValue Src = N0.getOperand(0);
3237     EVT SrcVT = Src.getValueType();
3238
3239     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3240     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3241                                   DAG.getConstant(0x8000, SL, SrcVT));
3242     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3243   }
3244   default:
3245     return SDValue();
3246   }
3247 }
3248
3249 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3250                                                  DAGCombinerInfo &DCI) const {
3251   SelectionDAG &DAG = DCI.DAG;
3252   SDValue N0 = N->getOperand(0);
3253
3254   if (!N0.hasOneUse())
3255     return SDValue();
3256
3257   switch (N0.getOpcode()) {
3258   case ISD::FP16_TO_FP: {
3259     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3260     SDLoc SL(N);
3261     SDValue Src = N0.getOperand(0);
3262     EVT SrcVT = Src.getValueType();
3263
3264     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3265     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3266                                   DAG.getConstant(0x7fff, SL, SrcVT));
3267     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3268   }
3269   default:
3270     return SDValue();
3271   }
3272 }
3273
3274 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3275                                                 DAGCombinerInfo &DCI) const {
3276   SelectionDAG &DAG = DCI.DAG;
3277   SDLoc DL(N);
3278
3279   switch(N->getOpcode()) {
3280   default:
3281     break;
3282   case ISD::BITCAST: {
3283     EVT DestVT = N->getValueType(0);
3284
3285     // Push casts through vector builds. This helps avoid emitting a large
3286     // number of copies when materializing floating point vector constants.
3287     //
3288     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3289     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3290     if (DestVT.isVector()) {
3291       SDValue Src = N->getOperand(0);
3292       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3293         EVT SrcVT = Src.getValueType();
3294         unsigned NElts = DestVT.getVectorNumElements();
3295
3296         if (SrcVT.getVectorNumElements() == NElts) {
3297           EVT DestEltVT = DestVT.getVectorElementType();
3298
3299           SmallVector<SDValue, 8> CastedElts;
3300           SDLoc SL(N);
3301           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3302             SDValue Elt = Src.getOperand(I);
3303             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3304           }
3305
3306           return DAG.getBuildVector(DestVT, SL, CastedElts);
3307         }
3308       }
3309     }
3310
3311     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3312       break;
3313
3314     // Fold bitcasts of constants.
3315     //
3316     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3317     // TODO: Generalize and move to DAGCombiner
3318     SDValue Src = N->getOperand(0);
3319     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3320       assert(Src.getValueType() == MVT::i64);
3321       SDLoc SL(N);
3322       uint64_t CVal = C->getZExtValue();
3323       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3324                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3325                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3326     }
3327
3328     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3329       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3330       SDLoc SL(N);
3331       uint64_t CVal = Val.getZExtValue();
3332       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3333                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3334                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3335
3336       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3337     }
3338
3339     break;
3340   }
3341   case ISD::SHL: {
3342     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3343       break;
3344
3345     return performShlCombine(N, DCI);
3346   }
3347   case ISD::SRL: {
3348     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3349       break;
3350
3351     return performSrlCombine(N, DCI);
3352   }
3353   case ISD::SRA: {
3354     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3355       break;
3356
3357     return performSraCombine(N, DCI);
3358   }
3359   case ISD::MUL:
3360     return performMulCombine(N, DCI);
3361   case ISD::MULHS:
3362     return performMulhsCombine(N, DCI);
3363   case ISD::MULHU:
3364     return performMulhuCombine(N, DCI);
3365   case AMDGPUISD::MUL_I24:
3366   case AMDGPUISD::MUL_U24:
3367   case AMDGPUISD::MULHI_I24:
3368   case AMDGPUISD::MULHI_U24: {
3369     // If the first call to simplify is successfull, then N may end up being
3370     // deleted, so we shouldn't call simplifyI24 again.
3371     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3372     return SDValue();
3373   }
3374   case AMDGPUISD::MUL_LOHI_I24:
3375   case AMDGPUISD::MUL_LOHI_U24:
3376     return performMulLoHi24Combine(N, DCI);
3377   case ISD::SELECT:
3378     return performSelectCombine(N, DCI);
3379   case ISD::FNEG:
3380     return performFNegCombine(N, DCI);
3381   case ISD::FABS:
3382     return performFAbsCombine(N, DCI);
3383   case AMDGPUISD::BFE_I32:
3384   case AMDGPUISD::BFE_U32: {
3385     assert(!N->getValueType(0).isVector() &&
3386            "Vector handling of BFE not implemented");
3387     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3388     if (!Width)
3389       break;
3390
3391     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3392     if (WidthVal == 0)
3393       return DAG.getConstant(0, DL, MVT::i32);
3394
3395     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3396     if (!Offset)
3397       break;
3398
3399     SDValue BitsFrom = N->getOperand(0);
3400     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3401
3402     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3403
3404     if (OffsetVal == 0) {
3405       // This is already sign / zero extended, so try to fold away extra BFEs.
3406       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3407
3408       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3409       if (OpSignBits >= SignBits)
3410         return BitsFrom;
3411
3412       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3413       if (Signed) {
3414         // This is a sign_extend_inreg. Replace it to take advantage of existing
3415         // DAG Combines. If not eliminated, we will match back to BFE during
3416         // selection.
3417
3418         // TODO: The sext_inreg of extended types ends, although we can could
3419         // handle them in a single BFE.
3420         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3421                            DAG.getValueType(SmallVT));
3422       }
3423
3424       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3425     }
3426
3427     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3428       if (Signed) {
3429         return constantFoldBFE<int32_t>(DAG,
3430                                         CVal->getSExtValue(),
3431                                         OffsetVal,
3432                                         WidthVal,
3433                                         DL);
3434       }
3435
3436       return constantFoldBFE<uint32_t>(DAG,
3437                                        CVal->getZExtValue(),
3438                                        OffsetVal,
3439                                        WidthVal,
3440                                        DL);
3441     }
3442
3443     if ((OffsetVal + WidthVal) >= 32) {
3444       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3445       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3446                          BitsFrom, ShiftVal);
3447     }
3448
3449     if (BitsFrom.hasOneUse()) {
3450       APInt Demanded = APInt::getBitsSet(32,
3451                                          OffsetVal,
3452                                          OffsetVal + WidthVal);
3453
3454       KnownBits Known;
3455       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3456                                             !DCI.isBeforeLegalizeOps());
3457       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3458       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
3459           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
3460         DCI.CommitTargetLoweringOpt(TLO);
3461       }
3462     }
3463
3464     break;
3465   }
3466   case ISD::LOAD:
3467     return performLoadCombine(N, DCI);
3468   case ISD::STORE:
3469     return performStoreCombine(N, DCI);
3470   case AMDGPUISD::CLAMP:
3471     return performClampCombine(N, DCI);
3472   case AMDGPUISD::RCP: {
3473     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3474       // XXX - Should this flush denormals?
3475       const APFloat &Val = CFP->getValueAPF();
3476       APFloat One(Val.getSemantics(), "1.0");
3477       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3478     }
3479
3480     break;
3481   }
3482   }
3483   return SDValue();
3484 }
3485
3486 //===----------------------------------------------------------------------===//
3487 // Helper functions
3488 //===----------------------------------------------------------------------===//
3489
3490 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3491                                                   const TargetRegisterClass *RC,
3492                                                    unsigned Reg, EVT VT) const {
3493   MachineFunction &MF = DAG.getMachineFunction();
3494   MachineRegisterInfo &MRI = MF.getRegInfo();
3495   unsigned VirtualRegister;
3496   if (!MRI.isLiveIn(Reg)) {
3497     VirtualRegister = MRI.createVirtualRegister(RC);
3498     MRI.addLiveIn(Reg, VirtualRegister);
3499   } else {
3500     VirtualRegister = MRI.getLiveInVirtReg(Reg);
3501   }
3502   return DAG.getRegister(VirtualRegister, VT);
3503 }
3504
3505 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3506     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3507   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3508   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3509   switch (Param) {
3510   case GRID_DIM:
3511     return ArgOffset;
3512   case GRID_OFFSET:
3513     return ArgOffset + 4;
3514   }
3515   llvm_unreachable("unexpected implicit parameter type");
3516 }
3517
3518 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3519
3520 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3521   switch ((AMDGPUISD::NodeType)Opcode) {
3522   case AMDGPUISD::FIRST_NUMBER: break;
3523   // AMDIL DAG nodes
3524   NODE_NAME_CASE(UMUL);
3525   NODE_NAME_CASE(BRANCH_COND);
3526
3527   // AMDGPU DAG nodes
3528   NODE_NAME_CASE(IF)
3529   NODE_NAME_CASE(ELSE)
3530   NODE_NAME_CASE(LOOP)
3531   NODE_NAME_CASE(CALL)
3532   NODE_NAME_CASE(TRAP)
3533   NODE_NAME_CASE(RET_FLAG)
3534   NODE_NAME_CASE(RETURN_TO_EPILOG)
3535   NODE_NAME_CASE(ENDPGM)
3536   NODE_NAME_CASE(DWORDADDR)
3537   NODE_NAME_CASE(FRACT)
3538   NODE_NAME_CASE(SETCC)
3539   NODE_NAME_CASE(SETREG)
3540   NODE_NAME_CASE(FMA_W_CHAIN)
3541   NODE_NAME_CASE(FMUL_W_CHAIN)
3542   NODE_NAME_CASE(CLAMP)
3543   NODE_NAME_CASE(COS_HW)
3544   NODE_NAME_CASE(SIN_HW)
3545   NODE_NAME_CASE(FMAX_LEGACY)
3546   NODE_NAME_CASE(FMIN_LEGACY)
3547   NODE_NAME_CASE(FMAX3)
3548   NODE_NAME_CASE(SMAX3)
3549   NODE_NAME_CASE(UMAX3)
3550   NODE_NAME_CASE(FMIN3)
3551   NODE_NAME_CASE(SMIN3)
3552   NODE_NAME_CASE(UMIN3)
3553   NODE_NAME_CASE(FMED3)
3554   NODE_NAME_CASE(SMED3)
3555   NODE_NAME_CASE(UMED3)
3556   NODE_NAME_CASE(URECIP)
3557   NODE_NAME_CASE(DIV_SCALE)
3558   NODE_NAME_CASE(DIV_FMAS)
3559   NODE_NAME_CASE(DIV_FIXUP)
3560   NODE_NAME_CASE(FMAD_FTZ)
3561   NODE_NAME_CASE(TRIG_PREOP)
3562   NODE_NAME_CASE(RCP)
3563   NODE_NAME_CASE(RSQ)
3564   NODE_NAME_CASE(RCP_LEGACY)
3565   NODE_NAME_CASE(RSQ_LEGACY)
3566   NODE_NAME_CASE(FMUL_LEGACY)
3567   NODE_NAME_CASE(RSQ_CLAMP)
3568   NODE_NAME_CASE(LDEXP)
3569   NODE_NAME_CASE(FP_CLASS)
3570   NODE_NAME_CASE(DOT4)
3571   NODE_NAME_CASE(CARRY)
3572   NODE_NAME_CASE(BORROW)
3573   NODE_NAME_CASE(BFE_U32)
3574   NODE_NAME_CASE(BFE_I32)
3575   NODE_NAME_CASE(BFI)
3576   NODE_NAME_CASE(BFM)
3577   NODE_NAME_CASE(FFBH_U32)
3578   NODE_NAME_CASE(FFBH_I32)
3579   NODE_NAME_CASE(MUL_U24)
3580   NODE_NAME_CASE(MUL_I24)
3581   NODE_NAME_CASE(MULHI_U24)
3582   NODE_NAME_CASE(MULHI_I24)
3583   NODE_NAME_CASE(MUL_LOHI_U24)
3584   NODE_NAME_CASE(MUL_LOHI_I24)
3585   NODE_NAME_CASE(MAD_U24)
3586   NODE_NAME_CASE(MAD_I24)
3587   NODE_NAME_CASE(TEXTURE_FETCH)
3588   NODE_NAME_CASE(EXPORT)
3589   NODE_NAME_CASE(EXPORT_DONE)
3590   NODE_NAME_CASE(R600_EXPORT)
3591   NODE_NAME_CASE(CONST_ADDRESS)
3592   NODE_NAME_CASE(REGISTER_LOAD)
3593   NODE_NAME_CASE(REGISTER_STORE)
3594   NODE_NAME_CASE(SAMPLE)
3595   NODE_NAME_CASE(SAMPLEB)
3596   NODE_NAME_CASE(SAMPLED)
3597   NODE_NAME_CASE(SAMPLEL)
3598   NODE_NAME_CASE(CVT_F32_UBYTE0)
3599   NODE_NAME_CASE(CVT_F32_UBYTE1)
3600   NODE_NAME_CASE(CVT_F32_UBYTE2)
3601   NODE_NAME_CASE(CVT_F32_UBYTE3)
3602   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3603   NODE_NAME_CASE(FP_TO_FP16)
3604   NODE_NAME_CASE(FP16_ZEXT)
3605   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3606   NODE_NAME_CASE(CONST_DATA_PTR)
3607   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3608   NODE_NAME_CASE(KILL)
3609   NODE_NAME_CASE(DUMMY_CHAIN)
3610   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3611   NODE_NAME_CASE(INIT_EXEC)
3612   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
3613   NODE_NAME_CASE(SENDMSG)
3614   NODE_NAME_CASE(SENDMSGHALT)
3615   NODE_NAME_CASE(INTERP_MOV)
3616   NODE_NAME_CASE(INTERP_P1)
3617   NODE_NAME_CASE(INTERP_P2)
3618   NODE_NAME_CASE(STORE_MSKOR)
3619   NODE_NAME_CASE(LOAD_CONSTANT)
3620   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3621   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3622   NODE_NAME_CASE(ATOMIC_INC)
3623   NODE_NAME_CASE(ATOMIC_DEC)
3624   NODE_NAME_CASE(BUFFER_LOAD)
3625   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3626   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3627   }
3628   return nullptr;
3629 }
3630
3631 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3632                                               SelectionDAG &DAG, int Enabled,
3633                                               int &RefinementSteps,
3634                                               bool &UseOneConstNR,
3635                                               bool Reciprocal) const {
3636   EVT VT = Operand.getValueType();
3637
3638   if (VT == MVT::f32) {
3639     RefinementSteps = 0;
3640     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3641   }
3642
3643   // TODO: There is also f64 rsq instruction, but the documentation is less
3644   // clear on its precision.
3645
3646   return SDValue();
3647 }
3648
3649 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3650                                                SelectionDAG &DAG, int Enabled,
3651                                                int &RefinementSteps) const {
3652   EVT VT = Operand.getValueType();
3653
3654   if (VT == MVT::f32) {
3655     // Reciprocal, < 1 ulp error.
3656     //
3657     // This reciprocal approximation converges to < 0.5 ulp error with one
3658     // newton rhapson performed with two fused multiple adds (FMAs).
3659
3660     RefinementSteps = 0;
3661     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3662   }
3663
3664   // TODO: There is also f64 rcp instruction, but the documentation is less
3665   // clear on its precision.
3666
3667   return SDValue();
3668 }
3669
3670 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3671     const SDValue Op, KnownBits &Known,
3672     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
3673
3674   Known.resetAll(); // Don't know anything.
3675
3676   KnownBits Known2;
3677   unsigned Opc = Op.getOpcode();
3678
3679   switch (Opc) {
3680   default:
3681     break;
3682   case AMDGPUISD::CARRY:
3683   case AMDGPUISD::BORROW: {
3684     Known.Zero = APInt::getHighBitsSet(32, 31);
3685     break;
3686   }
3687
3688   case AMDGPUISD::BFE_I32:
3689   case AMDGPUISD::BFE_U32: {
3690     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3691     if (!CWidth)
3692       return;
3693
3694     uint32_t Width = CWidth->getZExtValue() & 0x1f;
3695
3696     if (Opc == AMDGPUISD::BFE_U32)
3697       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
3698
3699     break;
3700   }
3701   case AMDGPUISD::FP_TO_FP16:
3702   case AMDGPUISD::FP16_ZEXT: {
3703     unsigned BitWidth = Known.getBitWidth();
3704
3705     // High bits are zero.
3706     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
3707     break;
3708   }
3709   }
3710 }
3711
3712 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3713     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
3714     unsigned Depth) const {
3715   switch (Op.getOpcode()) {
3716   case AMDGPUISD::BFE_I32: {
3717     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3718     if (!Width)
3719       return 1;
3720
3721     unsigned SignBits = 32 - Width->getZExtValue() + 1;
3722     if (!isNullConstant(Op.getOperand(1)))
3723       return SignBits;
3724
3725     // TODO: Could probably figure something out with non-0 offsets.
3726     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3727     return std::max(SignBits, Op0SignBits);
3728   }
3729
3730   case AMDGPUISD::BFE_U32: {
3731     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3732     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3733   }
3734
3735   case AMDGPUISD::CARRY:
3736   case AMDGPUISD::BORROW:
3737     return 31;
3738   case AMDGPUISD::FP_TO_FP16:
3739   case AMDGPUISD::FP16_ZEXT:
3740     return 16;
3741   default:
3742     return 1;
3743   }
3744 }