contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

   1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief This is the parent TargetLowering class for hardware code gen
  12 /// targets.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPUISelLowering.h"
  17 #include "AMDGPU.h"
  18 #include "AMDGPUFrameLowering.h"
  19 #include "AMDGPUIntrinsicInfo.h"
  20 #include "AMDGPURegisterInfo.h"
  21 #include "AMDGPUSubtarget.h"
  22 #include "R600MachineFunctionInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "llvm/CodeGen/CallingConvLower.h"
  25 #include "llvm/CodeGen/MachineFunction.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  29 #include "llvm/IR/DataLayout.h"
  30 #include "llvm/IR/DiagnosticInfo.h"
  31 #include "SIInstrInfo.h"
  32 using namespace llvm;
  33
  34 static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
  35                             CCValAssign::LocInfo LocInfo,
  36                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
  37   MachineFunction &MF = State.getMachineFunction();
  38   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  39
  40   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
  41                                          ArgFlags.getOrigAlign());
  42   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  43   return true;
  44 }
  45
  46 #include "AMDGPUGenCallingConv.inc"
  47
  48 // Find a larger type to do a load / store of a vector with.
  49 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  50   unsigned StoreSize = VT.getStoreSizeInBits();
  51   if (StoreSize <= 32)
  52     return EVT::getIntegerVT(Ctx, StoreSize);
  53
  54   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  55   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
  56 }
  57
  58 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
  59                                            const AMDGPUSubtarget &STI)
  60     : TargetLowering(TM), Subtarget(&STI) {
  61   // Lower floating point store/load to integer store/load to reduce the number
  62   // of patterns in tablegen.
  63   setOperationAction(ISD::LOAD, MVT::f32, Promote);
  64   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  65
  66   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  67   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
  68
  69   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  70   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  71
  72   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  73   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
  74
  75   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  76   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
  77
  78   setOperationAction(ISD::LOAD, MVT::i64, Promote);
  79   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
  80
  81   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  82   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
  83
  84   setOperationAction(ISD::LOAD, MVT::f64, Promote);
  85   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
  86
  87   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  88   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
  89
  90   // There are no 64-bit extloads. These should be done as a 32-bit extload and
  91   // an extension to 64-bit.
  92   for (MVT VT : MVT::integer_valuetypes()) {
  93     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
  94     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
  95     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  96   }
  97
  98   for (MVT VT : MVT::integer_valuetypes()) {
  99     if (VT == MVT::i64)
 100       continue;
 101
 102     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 103     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
 104     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
 105     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
 106
 107     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 108     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
 109     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
 110     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
 111
 112     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 113     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
 114     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
 115     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
 116   }
 117
 118   for (MVT VT : MVT::integer_vector_valuetypes()) {
 119     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
 120     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
 121     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
 122     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
 123     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
 124     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
 125     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
 126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
 127     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
 128     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
 129     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
 130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
 131   }
 132
 133   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 134   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
 135   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
 136   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
 137
 138   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 139   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
 140   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
 141   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
 142
 143   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 144   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
 145   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
 146   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
 147
 148   setOperationAction(ISD::STORE, MVT::f32, Promote);
 149   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 150
 151   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
 152   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
 153
 154   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
 155   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 156
 157   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
 158   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 159
 160   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
 161   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
 162
 163   setOperationAction(ISD::STORE, MVT::i64, Promote);
 164   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
 165
 166   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
 167   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
 168
 169   setOperationAction(ISD::STORE, MVT::f64, Promote);
 170   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
 171
 172   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
 173   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 174
 175   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
 176   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
 177   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 178   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 179
 180   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
 181   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
 182   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
 183   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
 184
 185   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 186   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
 187   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
 188   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
 189
 190   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 191   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 192
 193   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
 194   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 195
 196   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
 197   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 198
 199   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
 200   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 201
 202
 203   setOperationAction(ISD::Constant, MVT::i32, Legal);
 204   setOperationAction(ISD::Constant, MVT::i64, Legal);
 205   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 206   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 207
 208   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 209   setOperationAction(ISD::BRIND, MVT::Other, Expand);
 210
 211   // This is totally unsupported, just custom lower to produce an error.
 212   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 213
 214   // We need to custom lower some of the intrinsics
 215   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 216   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 217
 218   // Library functions.  These default to Expand, but we have instructions
 219   // for them.
 220   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
 221   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
 222   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
 223   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
 224   setOperationAction(ISD::FABS,   MVT::f32, Legal);
 225   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
 226   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 227   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 228   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
 229   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
 230
 231   setOperationAction(ISD::FROUND, MVT::f32, Custom);
 232   setOperationAction(ISD::FROUND, MVT::f64, Custom);
 233
 234   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
 235   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 236
 237   setOperationAction(ISD::FREM, MVT::f32, Custom);
 238   setOperationAction(ISD::FREM, MVT::f64, Custom);
 239
 240   // v_mad_f32 does not support denormals according to some sources.
 241   if (!Subtarget->hasFP32Denormals())
 242     setOperationAction(ISD::FMAD, MVT::f32, Legal);
 243
 244   // Expand to fneg + fadd.
 245   setOperationAction(ISD::FSUB, MVT::f64, Expand);
 246
 247   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
 248   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
 249   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
 250   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
 251   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
 252   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
 253   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
 254   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
 255   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
 256   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 257
 258   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 259     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
 260     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
 261     setOperationAction(ISD::FRINT, MVT::f64, Custom);
 262     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
 263   }
 264
 265   if (!Subtarget->hasBFI()) {
 266     // fcopysign can be done in a single instruction with BFI.
 267     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 268     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 269   }
 270
 271   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 272   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
 273
 274   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 275   for (MVT VT : ScalarIntVTs) {
 276     // These should use [SU]DIVREM, so set them to expand
 277     setOperationAction(ISD::SDIV, VT, Expand);
 278     setOperationAction(ISD::UDIV, VT, Expand);
 279     setOperationAction(ISD::SREM, VT, Expand);
 280     setOperationAction(ISD::UREM, VT, Expand);
 281
 282     // GPU does not have divrem function for signed or unsigned.
 283     setOperationAction(ISD::SDIVREM, VT, Custom);
 284     setOperationAction(ISD::UDIVREM, VT, Custom);
 285
 286     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
 287     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 288     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 289
 290     setOperationAction(ISD::BSWAP, VT, Expand);
 291     setOperationAction(ISD::CTTZ, VT, Expand);
 292     setOperationAction(ISD::CTLZ, VT, Expand);
 293   }
 294
 295   if (!Subtarget->hasBCNT(32))
 296     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
 297
 298   if (!Subtarget->hasBCNT(64))
 299     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 300
 301   // The hardware supports 32-bit ROTR, but not ROTL.
 302   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 303   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 304   setOperationAction(ISD::ROTR, MVT::i64, Expand);
 305
 306   setOperationAction(ISD::MUL, MVT::i64, Expand);
 307   setOperationAction(ISD::MULHU, MVT::i64, Expand);
 308   setOperationAction(ISD::MULHS, MVT::i64, Expand);
 309   setOperationAction(ISD::UDIV, MVT::i32, Expand);
 310   setOperationAction(ISD::UREM, MVT::i32, Expand);
 311   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 312   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 313   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 314   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 315   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 316
 317   setOperationAction(ISD::SMIN, MVT::i32, Legal);
 318   setOperationAction(ISD::UMIN, MVT::i32, Legal);
 319   setOperationAction(ISD::SMAX, MVT::i32, Legal);
 320   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 321
 322   if (Subtarget->hasFFBH())
 323     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
 324
 325   if (Subtarget->hasFFBL())
 326     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
 327
 328   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
 329   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 330
 331   // We only really have 32-bit BFE instructions (and 16-bit on VI).
 332   //
 333   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
 334   // effort to match them now. We want this to be false for i64 cases when the
 335   // extraction isn't restricted to the upper or lower half. Ideally we would
 336   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
 337   // span the midpoint are probably relatively rare, so don't worry about them
 338   // for now.
 339   if (Subtarget->hasBFE())
 340     setHasExtractBitsInsn(true);
 341
 342   static const MVT::SimpleValueType VectorIntTypes[] = {
 343     MVT::v2i32, MVT::v4i32
 344   };
 345
 346   for (MVT VT : VectorIntTypes) {
 347     // Expand the following operations for the current type by default.
 348     setOperationAction(ISD::ADD,  VT, Expand);
 349     setOperationAction(ISD::AND,  VT, Expand);
 350     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 351     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 352     setOperationAction(ISD::MUL,  VT, Expand);
 353     setOperationAction(ISD::MULHU, VT, Expand);
 354     setOperationAction(ISD::MULHS, VT, Expand);
 355     setOperationAction(ISD::OR,   VT, Expand);
 356     setOperationAction(ISD::SHL,  VT, Expand);
 357     setOperationAction(ISD::SRA,  VT, Expand);
 358     setOperationAction(ISD::SRL,  VT, Expand);
 359     setOperationAction(ISD::ROTL, VT, Expand);
 360     setOperationAction(ISD::ROTR, VT, Expand);
 361     setOperationAction(ISD::SUB,  VT, Expand);
 362     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 363     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 364     setOperationAction(ISD::SDIV, VT, Expand);
 365     setOperationAction(ISD::UDIV, VT, Expand);
 366     setOperationAction(ISD::SREM, VT, Expand);
 367     setOperationAction(ISD::UREM, VT, Expand);
 368     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 369     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 370     setOperationAction(ISD::SDIVREM, VT, Custom);
 371     setOperationAction(ISD::UDIVREM, VT, Expand);
 372     setOperationAction(ISD::ADDC, VT, Expand);
 373     setOperationAction(ISD::SUBC, VT, Expand);
 374     setOperationAction(ISD::ADDE, VT, Expand);
 375     setOperationAction(ISD::SUBE, VT, Expand);
 376     setOperationAction(ISD::SELECT, VT, Expand);
 377     setOperationAction(ISD::VSELECT, VT, Expand);
 378     setOperationAction(ISD::SELECT_CC, VT, Expand);
 379     setOperationAction(ISD::XOR,  VT, Expand);
 380     setOperationAction(ISD::BSWAP, VT, Expand);
 381     setOperationAction(ISD::CTPOP, VT, Expand);
 382     setOperationAction(ISD::CTTZ, VT, Expand);
 383     setOperationAction(ISD::CTLZ, VT, Expand);
 384     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 385   }
 386
 387   static const MVT::SimpleValueType FloatVectorTypes[] = {
 388     MVT::v2f32, MVT::v4f32
 389   };
 390
 391   for (MVT VT : FloatVectorTypes) {
 392     setOperationAction(ISD::FABS, VT, Expand);
 393     setOperationAction(ISD::FMINNUM, VT, Expand);
 394     setOperationAction(ISD::FMAXNUM, VT, Expand);
 395     setOperationAction(ISD::FADD, VT, Expand);
 396     setOperationAction(ISD::FCEIL, VT, Expand);
 397     setOperationAction(ISD::FCOS, VT, Expand);
 398     setOperationAction(ISD::FDIV, VT, Expand);
 399     setOperationAction(ISD::FEXP2, VT, Expand);
 400     setOperationAction(ISD::FLOG2, VT, Expand);
 401     setOperationAction(ISD::FREM, VT, Expand);
 402     setOperationAction(ISD::FPOW, VT, Expand);
 403     setOperationAction(ISD::FFLOOR, VT, Expand);
 404     setOperationAction(ISD::FTRUNC, VT, Expand);
 405     setOperationAction(ISD::FMUL, VT, Expand);
 406     setOperationAction(ISD::FMA, VT, Expand);
 407     setOperationAction(ISD::FRINT, VT, Expand);
 408     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 409     setOperationAction(ISD::FSQRT, VT, Expand);
 410     setOperationAction(ISD::FSIN, VT, Expand);
 411     setOperationAction(ISD::FSUB, VT, Expand);
 412     setOperationAction(ISD::FNEG, VT, Expand);
 413     setOperationAction(ISD::VSELECT, VT, Expand);
 414     setOperationAction(ISD::SELECT_CC, VT, Expand);
 415     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 416     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 417   }
 418
 419   // This causes using an unrolled select operation rather than expansion with
 420   // bit operations. This is in general better, but the alternative using BFI
 421   // instructions may be better if the select sources are SGPRs.
 422   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
 423   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
 424
 425   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
 426   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
 427
 428   // There are no libcalls of any kind.
 429   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
 430     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
 431
 432   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 433   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 434
 435   setSchedulingPreference(Sched::RegPressure);
 436   setJumpIsExpensive(true);
 437
 438   // FIXME: This is only partially true. If we have to do vector compares, any
 439   // SGPR pair can be a condition register. If we have a uniform condition, we
 440   // are better off doing SALU operations, where there is only one SCC. For now,
 441   // we don't have a way of knowing during instruction selection if a condition
 442   // will be uniform and we always use vector compares. Assume we are using
 443   // vector compares until that is fixed.
 444   setHasMultipleConditionRegisters(true);
 445
 446   // SI at least has hardware support for floating point exceptions, but no way
 447   // of using or handling them is implemented. They are also optional in OpenCL
 448   // (Section 7.3)
 449   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 450
 451   PredictableSelectIsExpensive = false;
 452
 453   // We want to find all load dependencies for long chains of stores to enable
 454   // merging into very wide vectors. The problem is with vectors with > 4
 455   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
 456   // vectors are a legal type, even though we have to split the loads
 457   // usually. When we can more precisely specify load legality per address
 458   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
 459   // smarter so that they can figure out what to do in 2 iterations without all
 460   // N > 4 stores on the same chain.
 461   GatherAllAliasesMaxDepth = 16;
 462
 463   // FIXME: Need to really handle these.
 464   MaxStoresPerMemcpy  = 4096;
 465   MaxStoresPerMemmove = 4096;
 466   MaxStoresPerMemset  = 4096;
 467
 468   setTargetDAGCombine(ISD::BITCAST);
 469   setTargetDAGCombine(ISD::SHL);
 470   setTargetDAGCombine(ISD::SRA);
 471   setTargetDAGCombine(ISD::SRL);
 472   setTargetDAGCombine(ISD::MUL);
 473   setTargetDAGCombine(ISD::MULHU);
 474   setTargetDAGCombine(ISD::MULHS);
 475   setTargetDAGCombine(ISD::SELECT);
 476   setTargetDAGCombine(ISD::SELECT_CC);
 477   setTargetDAGCombine(ISD::STORE);
 478   setTargetDAGCombine(ISD::FADD);
 479   setTargetDAGCombine(ISD::FSUB);
 480   setTargetDAGCombine(ISD::FNEG);
 481 }
 482
 483 //===----------------------------------------------------------------------===//
 484 // Target Information
 485 //===----------------------------------------------------------------------===//
 486
 487 static bool fnegFoldsIntoOp(unsigned Opc) {
 488   switch (Opc) {
 489   case ISD::FADD:
 490   case ISD::FSUB:
 491   case ISD::FMUL:
 492   case ISD::FMA:
 493   case ISD::FMAD:
 494   case ISD::FSIN:
 495   case AMDGPUISD::RCP:
 496   case AMDGPUISD::RCP_LEGACY:
 497   case AMDGPUISD::SIN_HW:
 498   case AMDGPUISD::FMUL_LEGACY:
 499     return true;
 500   default:
 501     return false;
 502   }
 503 }
 504
 505 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
 506   return MVT::i32;
 507 }
 508
 509 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
 510   return true;
 511 }
 512
 513 // The backend supports 32 and 64 bit floating point immediates.
 514 // FIXME: Why are we reporting vectors of FP immediates as legal?
 515 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 516   EVT ScalarVT = VT.getScalarType();
 517   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
 518          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
 519 }
 520
 521 // We don't want to shrink f64 / f32 constants.
 522 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
 523   EVT ScalarVT = VT.getScalarType();
 524   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
 525 }
 526
 527 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
 528                                                  ISD::LoadExtType,
 529                                                  EVT NewVT) const {
 530
 531   unsigned NewSize = NewVT.getStoreSizeInBits();
 532
 533   // If we are reducing to a 32-bit load, this is always better.
 534   if (NewSize == 32)
 535     return true;
 536
 537   EVT OldVT = N->getValueType(0);
 538   unsigned OldSize = OldVT.getStoreSizeInBits();
 539
 540   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
 541   // extloads, so doing one requires using a buffer_load. In cases where we
 542   // still couldn't use a scalar load, using the wider load shouldn't really
 543   // hurt anything.
 544
 545   // If the old size already had to be an extload, there's no harm in continuing
 546   // to reduce the width.
 547   return (OldSize < 32);
 548 }
 549
 550 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
 551                                                    EVT CastTy) const {
 552
 553   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 554
 555   if (LoadTy.getScalarType() == MVT::i32)
 556     return false;
 557
 558   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
 559   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 560
 561   return (LScalarSize < CastScalarSize) ||
 562          (CastScalarSize >= 32);
 563 }
 564
 565 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
 566 // profitable with the expansion for 64-bit since it's generally good to
 567 // speculate things.
 568 // FIXME: These should really have the size as a parameter.
 569 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
 570   return true;
 571 }
 572
 573 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
 574   return true;
 575 }
 576
 577 //===---------------------------------------------------------------------===//
 578 // Target Properties
 579 //===---------------------------------------------------------------------===//
 580
 581 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
 582   assert(VT.isFloatingPoint());
 583   return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
 584                                               VT == MVT::f16);
 585 }
 586
 587 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
 588   return isFAbsFree(VT);
 589 }
 590
 591 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
 592                                                          unsigned NumElem,
 593                                                          unsigned AS) const {
 594   return true;
 595 }
 596
 597 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
 598   // There are few operations which truly have vector input operands. Any vector
 599   // operation is going to involve operations on each component, and a
 600   // build_vector will be a copy per element, so it always makes sense to use a
 601   // build_vector input in place of the extracted element to avoid a copy into a
 602   // super register.
 603   //
 604   // We should probably only do this if all users are extracts only, but this
 605   // should be the common case.
 606   return true;
 607 }
 608
 609 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
 610   // Truncate is just accessing a subregister.
 611
 612   unsigned SrcSize = Source.getSizeInBits();
 613   unsigned DestSize = Dest.getSizeInBits();
 614
 615   return DestSize < SrcSize && DestSize % 32 == 0 ;
 616 }
 617
 618 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
 619   // Truncate is just accessing a subregister.
 620
 621   unsigned SrcSize = Source->getScalarSizeInBits();
 622   unsigned DestSize = Dest->getScalarSizeInBits();
 623
 624   if (DestSize== 16 && Subtarget->has16BitInsts())
 625     return SrcSize >= 32;
 626
 627   return DestSize < SrcSize && DestSize % 32 == 0;
 628 }
 629
 630 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
 631   unsigned SrcSize = Src->getScalarSizeInBits();
 632   unsigned DestSize = Dest->getScalarSizeInBits();
 633
 634   if (SrcSize == 16 && Subtarget->has16BitInsts())
 635     return DestSize >= 32;
 636
 637   return SrcSize == 32 && DestSize == 64;
 638 }
 639
 640 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
 641   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
 642   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
 643   // this will enable reducing 64-bit operations the 32-bit, which is always
 644   // good.
 645
 646   if (Src == MVT::i16)
 647     return Dest == MVT::i32 ||Dest == MVT::i64 ;
 648
 649   return Src == MVT::i32 && Dest == MVT::i64;
 650 }
 651
 652 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 653   return isZExtFree(Val.getValueType(), VT2);
 654 }
 655
 656 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 657   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
 658   // limited number of native 64-bit operations. Shrinking an operation to fit
 659   // in a single 32-bit register should always be helpful. As currently used,
 660   // this is much less general than the name suggests, and is only used in
 661   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
 662   // not profitable, and may actually be harmful.
 663   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 664 }
 665
 666 //===---------------------------------------------------------------------===//
 667 // TargetLowering Callbacks
 668 //===---------------------------------------------------------------------===//
 669
 670 /// The SelectionDAGBuilder will automatically promote function arguments
 671 /// with illegal types.  However, this does not work for the AMDGPU targets
 672 /// since the function arguments are stored in memory as these illegal types.
 673 /// In order to handle this properly we need to get the original types sizes
  674 /// from the LLVM IR Function and fixup the ISD::InputArg values before
 675 /// passing them to AnalyzeFormalArguments()
 676
 677 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
 678 /// input values across multiple registers.  Each item in the Ins array
  679 /// represents a single value that will be stored in registers.  Ins[x].VT is
 680 /// the value type of the value that will be stored in the register, so
 681 /// whatever SDNode we lower the argument to needs to be this type.
 682 ///
 683 /// In order to correctly lower the arguments we need to know the size of each
 684 /// argument.  Since Ins[x].VT gives us the size of the register that will
 685 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
  686 /// for the original function argument so that we can deduce the correct memory
 687 /// type to use for Ins[x].  In most cases the correct memory type will be
 688 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
 689 /// we have a kernel argument of type v8i8, this argument will be split into
 690 /// 8 parts and each part will be represented by its own item in the Ins array.
 691 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
 692 /// the argument before it was split.  From this, we deduce that the memory type
 693 /// for each individual part is i8.  We pass the memory type as LocVT to the
 694 /// calling convention analysis function and the register type (Ins[x].VT) as
 695 /// the ValVT.
 696 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
 697                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
 698   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
 699     const ISD::InputArg &In = Ins[i];
 700     EVT MemVT;
 701
 702     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
 703
 704     if (!Subtarget->isAmdHsaOS() &&
 705         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
 706       // The ABI says the caller will extend these values to 32-bits.
 707       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
 708     } else if (NumRegs == 1) {
 709       // This argument is not split, so the IR type is the memory type.
 710       assert(!In.Flags.isSplit());
 711       if (In.ArgVT.isExtended()) {
 712         // We have an extended type, like i24, so we should just use the register type
 713         MemVT = In.VT;
 714       } else {
 715         MemVT = In.ArgVT;
 716       }
 717     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
 718                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
 719       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
 720       // We have a vector value which has been split into a vector with
 721       // the same scalar type, but fewer elements.  This should handle
 722       // all the floating-point vector types.
 723       MemVT = In.VT;
 724     } else if (In.ArgVT.isVector() &&
 725                In.ArgVT.getVectorNumElements() == NumRegs) {
 726       // This arg has been split so that each element is stored in a separate
 727       // register.
 728       MemVT = In.ArgVT.getScalarType();
 729     } else if (In.ArgVT.isExtended()) {
 730       // We have an extended type, like i65.
 731       MemVT = In.VT;
 732     } else {
 733       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
 734       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
 735       if (In.VT.isInteger()) {
 736         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
 737       } else if (In.VT.isVector()) {
 738         assert(!In.VT.getScalarType().isFloatingPoint());
 739         unsigned NumElements = In.VT.getVectorNumElements();
 740         assert(MemoryBits % NumElements == 0);
 741         // This vector type has been split into another vector type with
 742         // a different elements size.
 743         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
 744                                          MemoryBits / NumElements);
 745         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
 746       } else {
 747         llvm_unreachable("cannot deduce memory type.");
 748       }
 749     }
 750
 751     // Convert one element vectors to scalar.
 752     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
 753       MemVT = MemVT.getScalarType();
 754
 755     if (MemVT.isExtended()) {
 756       // This should really only happen if we have vec3 arguments
 757       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
 758       MemVT = MemVT.getPow2VectorType(State.getContext());
 759     }
 760
 761     assert(MemVT.isSimple());
 762     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
 763                     State);
 764   }
 765 }
 766
 767 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
 768                               const SmallVectorImpl<ISD::InputArg> &Ins) const {
 769   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
 770 }
 771
 772 void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
 773                            const SmallVectorImpl<ISD::OutputArg> &Outs) const {
 774
 775   State.AnalyzeReturn(Outs, RetCC_SI);
 776 }
 777
 778 SDValue
 779 AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 780                                   bool isVarArg,
 781                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
 782                                   const SmallVectorImpl<SDValue> &OutVals,
 783                                   const SDLoc &DL, SelectionDAG &DAG) const {
 784   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 785 }
 786
 787 //===---------------------------------------------------------------------===//
 788 // Target specific lowering
 789 //===---------------------------------------------------------------------===//
 790
 791 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
 792                                         SmallVectorImpl<SDValue> &InVals) const {
 793   SDValue Callee = CLI.Callee;
 794   SelectionDAG &DAG = CLI.DAG;
 795
 796   const Function &Fn = *DAG.getMachineFunction().getFunction();
 797
 798   StringRef FuncName("<unknown>");
 799
 800   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
 801     FuncName = G->getSymbol();
 802   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
 803     FuncName = G->getGlobal()->getName();
 804
 805   DiagnosticInfoUnsupported NoCalls(
 806       Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
 807   DAG.getContext()->diagnose(NoCalls);
 808
 809   if (!CLI.IsTailCall) {
 810     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
 811       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
 812   }
 813
 814   return DAG.getEntryNode();
 815 }
 816
 817 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 818                                                       SelectionDAG &DAG) const {
 819   const Function &Fn = *DAG.getMachineFunction().getFunction();
 820
 821   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
 822                                             SDLoc(Op).getDebugLoc());
 823   DAG.getContext()->diagnose(NoDynamicAlloca);
 824   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
 825   return DAG.getMergeValues(Ops, SDLoc());
 826 }
 827
 828 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
 829                                              SelectionDAG &DAG) const {
 830   switch (Op.getOpcode()) {
 831   default:
 832     Op->dump(&DAG);
 833     llvm_unreachable("Custom lowering code for this"
 834                      "instruction is not implemented yet!");
 835     break;
 836   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
 837   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
 838   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
 839   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
 840   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
 841   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
 842   case ISD::FREM: return LowerFREM(Op, DAG);
 843   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
 844   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
 845   case ISD::FRINT: return LowerFRINT(Op, DAG);
 846   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
 847   case ISD::FROUND: return LowerFROUND(Op, DAG);
 848   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
 849   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
 850   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
 851   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
 852   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
 853   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
 854   case ISD::CTLZ:
 855   case ISD::CTLZ_ZERO_UNDEF:
 856     return LowerCTLZ(Op, DAG);
 857   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
 858   }
 859   return Op;
 860 }
 861
 862 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
 863                                               SmallVectorImpl<SDValue> &Results,
 864                                               SelectionDAG &DAG) const {
 865   switch (N->getOpcode()) {
 866   case ISD::SIGN_EXTEND_INREG:
 867     // Different parts of legalization seem to interpret which type of
 868     // sign_extend_inreg is the one to check for custom lowering. The extended
 869     // from type is what really matters, but some places check for custom
 870     // lowering of the result type. This results in trying to use
 871     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
 872     // nothing here and let the illegal result integer be handled normally.
 873     return;
 874   default:
 875     return;
 876   }
 877 }
 878
 879 static bool hasDefinedInitializer(const GlobalValue *GV) {
 880   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
 881   if (!GVar || !GVar->hasInitializer())
 882     return false;
 883
 884   return !isa<UndefValue>(GVar->getInitializer());
 885 }
 886
 887 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 888                                                  SDValue Op,
 889                                                  SelectionDAG &DAG) const {
 890
 891   const DataLayout &DL = DAG.getDataLayout();
 892   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
 893   const GlobalValue *GV = G->getGlobal();
 894
 895   switch (G->getAddressSpace()) {
 896   case AMDGPUAS::LOCAL_ADDRESS: {
 897     // XXX: What does the value of G->getOffset() mean?
 898     assert(G->getOffset() == 0 &&
 899          "Do not know what to do with an non-zero offset");
 900
 901     // TODO: We could emit code to handle the initialization somewhere.
 902     if (hasDefinedInitializer(GV))
 903       break;
 904
 905     unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
 906     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
 907   }
 908   }
 909
 910   const Function &Fn = *DAG.getMachineFunction().getFunction();
 911   DiagnosticInfoUnsupported BadInit(
 912       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
 913   DAG.getContext()->diagnose(BadInit);
 914   return SDValue();
 915 }
 916
 917 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
 918                                                   SelectionDAG &DAG) const {
 919   SmallVector<SDValue, 8> Args;
 920
 921   for (const SDUse &U : Op->ops())
 922     DAG.ExtractVectorElements(U.get(), Args);
 923
 924   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
 925 }
 926
 927 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
 928                                                      SelectionDAG &DAG) const {
 929
 930   SmallVector<SDValue, 8> Args;
 931   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 932   EVT VT = Op.getValueType();
 933   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
 934                             VT.getVectorNumElements());
 935
 936   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
 937 }
 938
 939 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 940     SelectionDAG &DAG) const {
 941   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 942   SDLoc DL(Op);
 943   EVT VT = Op.getValueType();
 944
 945   switch (IntrinsicID) {
 946     default: return Op;
 947     case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
 948       return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
 949                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
 950
 951     case AMDGPUIntrinsic::AMDGPU_bfe_i32:
 952       return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
 953                          Op.getOperand(1),
 954                          Op.getOperand(2),
 955                          Op.getOperand(3));
 956
 957     case AMDGPUIntrinsic::AMDGPU_bfe_u32:
 958       return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
 959                          Op.getOperand(1),
 960                          Op.getOperand(2),
 961                          Op.getOperand(3));
 962   }
 963 }
 964
 965 /// \brief Generate Min/Max node
 966 SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
 967                                                    SDValue LHS, SDValue RHS,
 968                                                    SDValue True, SDValue False,
 969                                                    SDValue CC,
 970                                                    DAGCombinerInfo &DCI) const {
 971   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 972     return SDValue();
 973
 974   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
 975     return SDValue();
 976
 977   SelectionDAG &DAG = DCI.DAG;
 978   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 979   switch (CCOpcode) {
 980   case ISD::SETOEQ:
 981   case ISD::SETONE:
 982   case ISD::SETUNE:
 983   case ISD::SETNE:
 984   case ISD::SETUEQ:
 985   case ISD::SETEQ:
 986   case ISD::SETFALSE:
 987   case ISD::SETFALSE2:
 988   case ISD::SETTRUE:
 989   case ISD::SETTRUE2:
 990   case ISD::SETUO:
 991   case ISD::SETO:
 992     break;
 993   case ISD::SETULE:
 994   case ISD::SETULT: {
 995     if (LHS == True)
 996       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
 997     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
 998   }
 999   case ISD::SETOLE:
1000   case ISD::SETOLT:
1001   case ISD::SETLE:
1002   case ISD::SETLT: {
1003     // Ordered. Assume ordered for undefined.
1004
1005     // Only do this after legalization to avoid interfering with other combines
1006     // which might occur.
1007     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1008         !DCI.isCalledByLegalizer())
1009       return SDValue();
1010
1011     // We need to permute the operands to get the correct NaN behavior. The
1012     // selected operand is the second one based on the failing compare with NaN,
1013     // so permute it based on the compare type the hardware uses.
1014     if (LHS == True)
1015       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1016     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1017   }
1018   case ISD::SETUGE:
1019   case ISD::SETUGT: {
1020     if (LHS == True)
1021       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1022     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1023   }
1024   case ISD::SETGT:
1025   case ISD::SETGE:
1026   case ISD::SETOGE:
1027   case ISD::SETOGT: {
1028     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1029         !DCI.isCalledByLegalizer())
1030       return SDValue();
1031
1032     if (LHS == True)
1033       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1034     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1035   }
1036   case ISD::SETCC_INVALID:
1037     llvm_unreachable("Invalid setcc condcode!");
1038   }
1039   return SDValue();
1040 }
1041
1042 std::pair<SDValue, SDValue>
1043 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1044   SDLoc SL(Op);
1045
1046   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1047
1048   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1049   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1050
1051   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1052   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1053
1054   return std::make_pair(Lo, Hi);
1055 }
1056
1057 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1058   SDLoc SL(Op);
1059
1060   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1061   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1062   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1063 }
1064
1065 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1066   SDLoc SL(Op);
1067
1068   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1069   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1070   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1071 }
1072
1073 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1074                                               SelectionDAG &DAG) const {
1075   LoadSDNode *Load = cast<LoadSDNode>(Op);
1076   EVT VT = Op.getValueType();
1077
1078
1079   // If this is a 2 element vector, we really want to scalarize and not create
1080   // weird 1 element vectors.
1081   if (VT.getVectorNumElements() == 2)
1082     return scalarizeVectorLoad(Load, DAG);
1083
1084   SDValue BasePtr = Load->getBasePtr();
1085   EVT PtrVT = BasePtr.getValueType();
1086   EVT MemVT = Load->getMemoryVT();
1087   SDLoc SL(Op);
1088
1089   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1090
1091   EVT LoVT, HiVT;
1092   EVT LoMemVT, HiMemVT;
1093   SDValue Lo, Hi;
1094
1095   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1096   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1097   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1098
1099   unsigned Size = LoMemVT.getStoreSize();
1100   unsigned BaseAlign = Load->getAlignment();
1101   unsigned HiAlign = MinAlign(BaseAlign, Size);
1102
1103   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1104                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1105                                   BaseAlign, Load->getMemOperand()->getFlags());
1106   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1107                               DAG.getConstant(Size, SL, PtrVT));
1108   SDValue HiLoad =
1109       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1110                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1111                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1112
1113   SDValue Ops[] = {
1114     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1115     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1116                 LoLoad.getValue(1), HiLoad.getValue(1))
1117   };
1118
1119   return DAG.getMergeValues(Ops, SL);
1120 }
1121
1122 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1123                                                SelectionDAG &DAG) const {
1124   StoreSDNode *Store = cast<StoreSDNode>(Op);
1125   SDValue Val = Store->getValue();
1126   EVT VT = Val.getValueType();
1127
1128   // If this is a 2 element vector, we really want to scalarize and not create
1129   // weird 1 element vectors.
1130   if (VT.getVectorNumElements() == 2)
1131     return scalarizeVectorStore(Store, DAG);
1132
1133   EVT MemVT = Store->getMemoryVT();
1134   SDValue Chain = Store->getChain();
1135   SDValue BasePtr = Store->getBasePtr();
1136   SDLoc SL(Op);
1137
1138   EVT LoVT, HiVT;
1139   EVT LoMemVT, HiMemVT;
1140   SDValue Lo, Hi;
1141
1142   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1143   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1144   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1145
1146   EVT PtrVT = BasePtr.getValueType();
1147   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1148                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
1149                                               PtrVT));
1150
1151   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1152   unsigned BaseAlign = Store->getAlignment();
1153   unsigned Size = LoMemVT.getStoreSize();
1154   unsigned HiAlign = MinAlign(BaseAlign, Size);
1155
1156   SDValue LoStore =
1157       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1158                         Store->getMemOperand()->getFlags());
1159   SDValue HiStore =
1160       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1161                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1162
1163   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1164 }
1165
1166 // This is a shortcut for integer division because we have fast i32<->f32
1167 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1168 // float is enough to accurately represent up to a 24-bit signed integer.
1169 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1170                                             bool Sign) const {
1171   SDLoc DL(Op);
1172   EVT VT = Op.getValueType();
1173   SDValue LHS = Op.getOperand(0);
1174   SDValue RHS = Op.getOperand(1);
1175   MVT IntVT = MVT::i32;
1176   MVT FltVT = MVT::f32;
1177
1178   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1179   if (LHSSignBits < 9)
1180     return SDValue();
1181
1182   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1183   if (RHSSignBits < 9)
1184     return SDValue();
1185
1186   unsigned BitSize = VT.getSizeInBits();
1187   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1188   unsigned DivBits = BitSize - SignBits;
1189   if (Sign)
1190     ++DivBits;
1191
1192   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1193   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1194
1195   SDValue jq = DAG.getConstant(1, DL, IntVT);
1196
1197   if (Sign) {
1198     // char|short jq = ia ^ ib;
1199     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1200
1201     // jq = jq >> (bitsize - 2)
1202     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1203                      DAG.getConstant(BitSize - 2, DL, VT));
1204
1205     // jq = jq | 0x1
1206     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1207   }
1208
1209   // int ia = (int)LHS;
1210   SDValue ia = LHS;
1211
1212   // int ib, (int)RHS;
1213   SDValue ib = RHS;
1214
1215   // float fa = (float)ia;
1216   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1217
1218   // float fb = (float)ib;
1219   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1220
1221   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1222                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1223
1224   // fq = trunc(fq);
1225   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1226
1227   // float fqneg = -fq;
1228   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1229
1230   // float fr = mad(fqneg, fb, fa);
1231   SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
1232
1233   // int iq = (int)fq;
1234   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1235
1236   // fr = fabs(fr);
1237   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1238
1239   // fb = fabs(fb);
1240   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1241
1242   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1243
1244   // int cv = fr >= fb;
1245   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1246
1247   // jq = (cv ? jq : 0);
1248   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1249
1250   // dst = iq + jq;
1251   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1252
1253   // Rem needs compensation, it's easier to recompute it
1254   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1255   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1256
1257   // Truncate to number of bits this divide really is.
1258   if (Sign) {
1259     SDValue InRegSize
1260       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1261     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1262     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1263   } else {
1264     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1265     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1266     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1267   }
1268
1269   return DAG.getMergeValues({ Div, Rem }, DL);
1270 }
1271
1272 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1273                                       SelectionDAG &DAG,
1274                                       SmallVectorImpl<SDValue> &Results) const {
1275   assert(Op.getValueType() == MVT::i64);
1276
1277   SDLoc DL(Op);
1278   EVT VT = Op.getValueType();
1279   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1280
1281   SDValue one = DAG.getConstant(1, DL, HalfVT);
1282   SDValue zero = DAG.getConstant(0, DL, HalfVT);
1283
1284   //HiLo split
1285   SDValue LHS = Op.getOperand(0);
1286   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1287   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1288
1289   SDValue RHS = Op.getOperand(1);
1290   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1291   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1292
1293   if (VT == MVT::i64 &&
1294     DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1295     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1296
1297     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1298                               LHS_Lo, RHS_Lo);
1299
1300     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1301     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1302
1303     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1304     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1305     return;
1306   }
1307
1308   // Get Speculative values
1309   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1310   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1311
1312   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1313   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1314   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1315
1316   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1317   SDValue DIV_Lo = zero;
1318
1319   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1320
1321   for (unsigned i = 0; i < halfBitWidth; ++i) {
1322     const unsigned bitPos = halfBitWidth - i - 1;
1323     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1324     // Get value of high bit
1325     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1326     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1327     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1328
1329     // Shift
1330     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1331     // Add LHS high bit
1332     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1333
1334     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1335     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1336
1337     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1338
1339     // Update REM
1340     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1341     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1342   }
1343
1344   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1345   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1346   Results.push_back(DIV);
1347   Results.push_back(REM);
1348 }
1349
1350 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1351                                            SelectionDAG &DAG) const {
1352   SDLoc DL(Op);
1353   EVT VT = Op.getValueType();
1354
1355   if (VT == MVT::i64) {
1356     SmallVector<SDValue, 2> Results;
1357     LowerUDIVREM64(Op, DAG, Results);
1358     return DAG.getMergeValues(Results, DL);
1359   }
1360
1361   if (VT == MVT::i32) {
1362     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1363       return Res;
1364   }
1365
1366   SDValue Num = Op.getOperand(0);
1367   SDValue Den = Op.getOperand(1);
1368
1369   // RCP =  URECIP(Den) = 2^32 / Den + e
1370   // e is rounding error.
1371   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1372
1373   // RCP_LO = mul(RCP, Den) */
1374   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1375
1376   // RCP_HI = mulhu (RCP, Den) */
1377   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1378
1379   // NEG_RCP_LO = -RCP_LO
1380   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1381                                                      RCP_LO);
1382
1383   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1384   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1385                                            NEG_RCP_LO, RCP_LO,
1386                                            ISD::SETEQ);
1387   // Calculate the rounding error from the URECIP instruction
1388   // E = mulhu(ABS_RCP_LO, RCP)
1389   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1390
1391   // RCP_A_E = RCP + E
1392   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1393
1394   // RCP_S_E = RCP - E
1395   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1396
1397   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1398   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1399                                      RCP_A_E, RCP_S_E,
1400                                      ISD::SETEQ);
1401   // Quotient = mulhu(Tmp0, Num)
1402   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1403
1404   // Num_S_Remainder = Quotient * Den
1405   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1406
1407   // Remainder = Num - Num_S_Remainder
1408   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1409
1410   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1411   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1412                                                  DAG.getConstant(-1, DL, VT),
1413                                                  DAG.getConstant(0, DL, VT),
1414                                                  ISD::SETUGE);
1415   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1416   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1417                                                   Num_S_Remainder,
1418                                                   DAG.getConstant(-1, DL, VT),
1419                                                   DAG.getConstant(0, DL, VT),
1420                                                   ISD::SETUGE);
1421   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1422   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1423                                                Remainder_GE_Zero);
1424
1425   // Calculate Division result:
1426
1427   // Quotient_A_One = Quotient + 1
1428   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1429                                        DAG.getConstant(1, DL, VT));
1430
1431   // Quotient_S_One = Quotient - 1
1432   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1433                                        DAG.getConstant(1, DL, VT));
1434
1435   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1436   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1437                                      Quotient, Quotient_A_One, ISD::SETEQ);
1438
1439   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1440   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1441                             Quotient_S_One, Div, ISD::SETEQ);
1442
1443   // Calculate Rem result:
1444
1445   // Remainder_S_Den = Remainder - Den
1446   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1447
1448   // Remainder_A_Den = Remainder + Den
1449   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1450
1451   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1452   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1453                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1454
1455   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1456   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1457                             Remainder_A_Den, Rem, ISD::SETEQ);
1458   SDValue Ops[2] = {
1459     Div,
1460     Rem
1461   };
1462   return DAG.getMergeValues(Ops, DL);
1463 }
1464
1465 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1466                                            SelectionDAG &DAG) const {
1467   SDLoc DL(Op);
1468   EVT VT = Op.getValueType();
1469
1470   SDValue LHS = Op.getOperand(0);
1471   SDValue RHS = Op.getOperand(1);
1472
1473   SDValue Zero = DAG.getConstant(0, DL, VT);
1474   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1475
1476   if (VT == MVT::i32) {
1477     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1478       return Res;
1479   }
1480
1481   if (VT == MVT::i64 &&
1482       DAG.ComputeNumSignBits(LHS) > 32 &&
1483       DAG.ComputeNumSignBits(RHS) > 32) {
1484     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1485
1486     //HiLo split
1487     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1488     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1489     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1490                                  LHS_Lo, RHS_Lo);
1491     SDValue Res[2] = {
1492       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1493       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1494     };
1495     return DAG.getMergeValues(Res, DL);
1496   }
1497
1498   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1499   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1500   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1501   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1502
1503   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1504   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1505
1506   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1507   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1508
1509   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1510   SDValue Rem = Div.getValue(1);
1511
1512   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1513   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1514
1515   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1516   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1517
1518   SDValue Res[2] = {
1519     Div,
1520     Rem
1521   };
1522   return DAG.getMergeValues(Res, DL);
1523 }
1524
1525 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1526 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1527   SDLoc SL(Op);
1528   EVT VT = Op.getValueType();
1529   SDValue X = Op.getOperand(0);
1530   SDValue Y = Op.getOperand(1);
1531
1532   // TODO: Should this propagate fast-math-flags?
1533
1534   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1535   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1536   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1537
1538   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1539 }
1540
1541 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1542   SDLoc SL(Op);
1543   SDValue Src = Op.getOperand(0);
1544
1545   // result = trunc(src)
1546   // if (src > 0.0 && src != result)
1547   //   result += 1.0
1548
1549   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1550
1551   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1552   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1553
1554   EVT SetCCVT =
1555       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1556
1557   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1558   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1559   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1560
1561   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1562   // TODO: Should this propagate fast-math-flags?
1563   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1564 }
1565
1566 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1567                                   SelectionDAG &DAG) {
1568   const unsigned FractBits = 52;
1569   const unsigned ExpBits = 11;
1570
1571   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1572                                 Hi,
1573                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
1574                                 DAG.getConstant(ExpBits, SL, MVT::i32));
1575   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1576                             DAG.getConstant(1023, SL, MVT::i32));
1577
1578   return Exp;
1579 }
1580
1581 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1582   SDLoc SL(Op);
1583   SDValue Src = Op.getOperand(0);
1584
1585   assert(Op.getValueType() == MVT::f64);
1586
1587   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1588   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1589
1590   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1591
1592   // Extract the upper half, since this is where we will find the sign and
1593   // exponent.
1594   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1595
1596   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1597
1598   const unsigned FractBits = 52;
1599
1600   // Extract the sign bit.
1601   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1602   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1603
1604   // Extend back to to 64-bits.
1605   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1606   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1607
1608   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1609   const SDValue FractMask
1610     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1611
1612   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1613   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1614   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1615
1616   EVT SetCCVT =
1617       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1618
1619   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1620
1621   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1622   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1623
1624   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1625   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1626
1627   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1628 }
1629
1630 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1631   SDLoc SL(Op);
1632   SDValue Src = Op.getOperand(0);
1633
1634   assert(Op.getValueType() == MVT::f64);
1635
1636   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1637   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1638   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1639
1640   // TODO: Should this propagate fast-math-flags?
1641
1642   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1643   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1644
1645   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1646
1647   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1648   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1649
1650   EVT SetCCVT =
1651       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1652   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1653
1654   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1655 }
1656
1657 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1658   // FNEARBYINT and FRINT are the same, except in their handling of FP
1659   // exceptions. Those aren't really meaningful for us, and OpenCL only has
1660   // rint, so just treat them as equivalent.
1661   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1662 }
1663
1664 // XXX - May require not supporting f32 denormals?
1665 SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
1666   SDLoc SL(Op);
1667   SDValue X = Op.getOperand(0);
1668
1669   SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
1670
1671   // TODO: Should this propagate fast-math-flags?
1672
1673   SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
1674
1675   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
1676
1677   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
1678   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
1679   const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
1680
1681   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
1682
1683   EVT SetCCVT =
1684       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
1685
1686   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1687
1688   SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
1689
1690   return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
1691 }
1692
1693 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1694   SDLoc SL(Op);
1695   SDValue X = Op.getOperand(0);
1696
1697   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1698
1699   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1700   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1701   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1702   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1703   EVT SetCCVT =
1704       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1705
1706   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1707
1708   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1709
1710   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1711
1712   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1713                                        MVT::i64);
1714
1715   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1716   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1717                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
1718                                           MVT::i64),
1719                           Exp);
1720
1721   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1722   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1723                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
1724                               ISD::SETNE);
1725
1726   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1727                              D, DAG.getConstant(0, SL, MVT::i64));
1728   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1729
1730   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1731   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1732
1733   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1734   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1735   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1736
1737   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1738                             ExpEqNegOne,
1739                             DAG.getConstantFP(1.0, SL, MVT::f64),
1740                             DAG.getConstantFP(0.0, SL, MVT::f64));
1741
1742   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1743
1744   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1745   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1746
1747   return K;
1748 }
1749
1750 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1751   EVT VT = Op.getValueType();
1752
1753   if (VT == MVT::f32)
1754     return LowerFROUND32(Op, DAG);
1755
1756   if (VT == MVT::f64)
1757     return LowerFROUND64(Op, DAG);
1758
1759   llvm_unreachable("unhandled type");
1760 }
1761
1762 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1763   SDLoc SL(Op);
1764   SDValue Src = Op.getOperand(0);
1765
1766   // result = trunc(src);
1767   // if (src < 0.0 && src != result)
1768   //   result += -1.0.
1769
1770   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1771
1772   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1773   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1774
1775   EVT SetCCVT =
1776       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1777
1778   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1779   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1780   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1781
1782   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1783   // TODO: Should this propagate fast-math-flags?
1784   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1785 }
1786
1787 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1788   SDLoc SL(Op);
1789   SDValue Src = Op.getOperand(0);
1790   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1791
1792   if (ZeroUndef && Src.getValueType() == MVT::i32)
1793     return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1794
1795   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1796
1797   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1799
1800   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1801   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1802
1803   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1804                                    *DAG.getContext(), MVT::i32);
1805
1806   SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1807
1808   SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1809   SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1810
1811   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1812   SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1813
1814   // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
1815   SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
1816
1817   if (!ZeroUndef) {
1818     // Test if the full 64-bit input is zero.
1819
1820     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
1821     // which we probably don't want.
1822     SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
1823     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
1824
1825     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
1826     // with the same cycles, otherwise it is slower.
1827     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
1828     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
1829
1830     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
1831
1832     // The instruction returns -1 for 0 input, but the defined intrinsic
1833     // behavior is to return the number of bits.
1834     NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
1835                           SrcIsZero, Bits32, NewCtlz);
1836   }
1837
1838   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
1839 }
1840
1841 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
1842                                                bool Signed) const {
1843   // Unsigned
1844   // cul2f(ulong u)
1845   //{
1846   //  uint lz = clz(u);
1847   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
1848   //  u = (u << lz) & 0x7fffffffffffffffUL;
1849   //  ulong t = u & 0xffffffffffUL;
1850   //  uint v = (e << 23) | (uint)(u >> 40);
1851   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
1852   //  return as_float(v + r);
1853   //}
1854   // Signed
1855   // cl2f(long l)
1856   //{
1857   //  long s = l >> 63;
1858   //  float r = cul2f((l + s) ^ s);
1859   //  return s ? -r : r;
1860   //}
1861
1862   SDLoc SL(Op);
1863   SDValue Src = Op.getOperand(0);
1864   SDValue L = Src;
1865
1866   SDValue S;
1867   if (Signed) {
1868     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
1869     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
1870
1871     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
1872     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
1873   }
1874
1875   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1876                                    *DAG.getContext(), MVT::f32);
1877
1878
1879   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
1880   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
1881   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
1882   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
1883
1884   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
1885   SDValue E = DAG.getSelect(SL, MVT::i32,
1886     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
1887     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
1888     ZeroI32);
1889
1890   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
1891     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
1892     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
1893
1894   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
1895                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
1896
1897   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
1898                              U, DAG.getConstant(40, SL, MVT::i64));
1899
1900   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
1901     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
1902     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
1903
1904   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
1905   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
1906   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
1907
1908   SDValue One = DAG.getConstant(1, SL, MVT::i32);
1909
1910   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
1911
1912   SDValue R = DAG.getSelect(SL, MVT::i32,
1913     RCmp,
1914     One,
1915     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
1916   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
1917   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
1918
1919   if (!Signed)
1920     return R;
1921
1922   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
1923   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
1924 }
1925
1926 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
1927                                                bool Signed) const {
1928   SDLoc SL(Op);
1929   SDValue Src = Op.getOperand(0);
1930
1931   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1932
1933   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1934                            DAG.getConstant(0, SL, MVT::i32));
1935   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
1936                            DAG.getConstant(1, SL, MVT::i32));
1937
1938   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
1939                               SL, MVT::f64, Hi);
1940
1941   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
1942
1943   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
1944                               DAG.getConstant(32, SL, MVT::i32));
1945   // TODO: Should this propagate fast-math-flags?
1946   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
1947 }
1948
1949 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
1950                                                SelectionDAG &DAG) const {
1951   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
1952          "operation should be legal");
1953
1954   // TODO: Factor out code common with LowerSINT_TO_FP.
1955
1956   EVT DestVT = Op.getValueType();
1957   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1958     SDLoc DL(Op);
1959     SDValue Src = Op.getOperand(0);
1960
1961     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1962     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1963     SDValue FPRound =
1964         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1965
1966     return FPRound;
1967   }
1968
1969   if (DestVT == MVT::f32)
1970     return LowerINT_TO_FP32(Op, DAG, false);
1971
1972   assert(DestVT == MVT::f64);
1973   return LowerINT_TO_FP64(Op, DAG, false);
1974 }
1975
1976 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
1977                                               SelectionDAG &DAG) const {
1978   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
1979          "operation should be legal");
1980
1981   // TODO: Factor out code common with LowerUINT_TO_FP.
1982
1983   EVT DestVT = Op.getValueType();
1984   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1985     SDLoc DL(Op);
1986     SDValue Src = Op.getOperand(0);
1987
1988     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1989     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1990     SDValue FPRound =
1991         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1992
1993     return FPRound;
1994   }
1995
1996   if (DestVT == MVT::f32)
1997     return LowerINT_TO_FP32(Op, DAG, true);
1998
1999   assert(DestVT == MVT::f64);
2000   return LowerINT_TO_FP64(Op, DAG, true);
2001 }
2002
2003 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2004                                                bool Signed) const {
2005   SDLoc SL(Op);
2006
2007   SDValue Src = Op.getOperand(0);
2008
2009   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2010
2011   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2012                                  MVT::f64);
2013   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2014                                  MVT::f64);
2015   // TODO: Should this propagate fast-math-flags?
2016   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2017
2018   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2019
2020
2021   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2022
2023   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2024                            MVT::i32, FloorMul);
2025   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2026
2027   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2028
2029   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2030 }
2031
2032 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2033
2034   if (getTargetMachine().Options.UnsafeFPMath) {
2035     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2036     return SDValue();
2037   }
2038
2039   SDLoc DL(Op);
2040   SDValue N0 = Op.getOperand(0);
2041   assert (N0.getSimpleValueType() == MVT::f64);
2042
2043   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2044   const unsigned ExpMask = 0x7ff;
2045   const unsigned ExpBiasf64 = 1023;
2046   const unsigned ExpBiasf16 = 15;
2047   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2048   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2049   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2050   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2051                            DAG.getConstant(32, DL, MVT::i64));
2052   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2053   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2054   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2055                           DAG.getConstant(20, DL, MVT::i64));
2056   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2057                   DAG.getConstant(ExpMask, DL, MVT::i32));
2058   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2059   // add the f16 bias (15) to get the biased exponent for the f16 format.
2060   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2061                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2062
2063   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2064                           DAG.getConstant(8, DL, MVT::i32));
2065   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2066                   DAG.getConstant(0xffe, DL, MVT::i32));
2067
2068   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2069                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2070   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2071
2072   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2073   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2074
2075   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2076   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2077       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2078                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2079
2080   // N = M | (E << 12);
2081   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2082       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2083                   DAG.getConstant(12, DL, MVT::i32)));
2084
2085   // B = clamp(1-E, 0, 13);
2086   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2087                                   One, E);
2088   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2089   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2090                   DAG.getConstant(13, DL, MVT::i32));
2091
2092   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2093                                    DAG.getConstant(0x1000, DL, MVT::i32));
2094
2095   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2096   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2097   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2098   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2099
2100   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2101   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2102                               DAG.getConstant(0x7, DL, MVT::i32));
2103   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2104                   DAG.getConstant(2, DL, MVT::i32));
2105   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2106                                One, Zero, ISD::SETEQ);
2107   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2108                                One, Zero, ISD::SETGT);
2109   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2110   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2111
2112   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2113                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2114   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2115                       I, V, ISD::SETEQ);
2116
2117   // Extract the sign bit.
2118   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2119                             DAG.getConstant(16, DL, MVT::i32));
2120   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2121                      DAG.getConstant(0x8000, DL, MVT::i32));
2122
2123   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2124   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2125 }
2126
2127 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2128                                               SelectionDAG &DAG) const {
2129   SDValue Src = Op.getOperand(0);
2130
2131   // TODO: Factor out code common with LowerFP_TO_UINT.
2132
2133   EVT SrcVT = Src.getValueType();
2134   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2135     SDLoc DL(Op);
2136
2137     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2138     SDValue FpToInt32 =
2139         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2140
2141     return FpToInt32;
2142   }
2143
2144   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2145     return LowerFP64_TO_INT(Op, DAG, true);
2146
2147   return SDValue();
2148 }
2149
2150 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2151                                               SelectionDAG &DAG) const {
2152   SDValue Src = Op.getOperand(0);
2153
2154   // TODO: Factor out code common with LowerFP_TO_SINT.
2155
2156   EVT SrcVT = Src.getValueType();
2157   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2158     SDLoc DL(Op);
2159
2160     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2161     SDValue FpToInt32 =
2162         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2163
2164     return FpToInt32;
2165   }
2166
2167   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2168     return LowerFP64_TO_INT(Op, DAG, false);
2169
2170   return SDValue();
2171 }
2172
2173 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2174                                                      SelectionDAG &DAG) const {
2175   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2176   MVT VT = Op.getSimpleValueType();
2177   MVT ScalarVT = VT.getScalarType();
2178
2179   assert(VT.isVector());
2180
2181   SDValue Src = Op.getOperand(0);
2182   SDLoc DL(Op);
2183
2184   // TODO: Don't scalarize on Evergreen?
2185   unsigned NElts = VT.getVectorNumElements();
2186   SmallVector<SDValue, 8> Args;
2187   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2188
2189   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2190   for (unsigned I = 0; I < NElts; ++I)
2191     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2192
2193   return DAG.getBuildVector(VT, DL, Args);
2194 }
2195
2196 //===----------------------------------------------------------------------===//
2197 // Custom DAG optimizations
2198 //===----------------------------------------------------------------------===//
2199
2200 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2201   APInt KnownZero, KnownOne;
2202   EVT VT = Op.getValueType();
2203   DAG.computeKnownBits(Op, KnownZero, KnownOne);
2204
2205   return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
2206 }
2207
2208 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2209   EVT VT = Op.getValueType();
2210
2211   // In order for this to be a signed 24-bit value, bit 23, must
2212   // be a sign bit.
2213   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2214                                      // as unsigned 24-bit values.
2215          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2216 }
2217
2218 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2219                         TargetLowering::DAGCombinerInfo &DCI) {
2220
2221   SelectionDAG &DAG = DCI.DAG;
2222   SDValue Op = Node24->getOperand(OpIdx);
2223   EVT VT = Op.getValueType();
2224
2225   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2226   APInt KnownZero, KnownOne;
2227   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2228   if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
2229     return true;
2230
2231   return false;
2232 }
2233
2234 template <typename IntTy>
2235 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2236                                uint32_t Width, const SDLoc &DL) {
2237   if (Width + Offset < 32) {
2238     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2239     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2240     return DAG.getConstant(Result, DL, MVT::i32);
2241   }
2242
2243   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2244 }
2245
2246 static bool hasVolatileUser(SDNode *Val) {
2247   for (SDNode *U : Val->uses()) {
2248     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2249       if (M->isVolatile())
2250         return true;
2251     }
2252   }
2253
2254   return false;
2255 }
2256
2257 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2258   // i32 vectors are the canonical memory type.
2259   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2260     return false;
2261
2262   if (!VT.isByteSized())
2263     return false;
2264
2265   unsigned Size = VT.getStoreSize();
2266
2267   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2268     return false;
2269
2270   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2271     return false;
2272
2273   return true;
2274 }
2275
2276 // Replace load of an illegal type with a store of a bitcast to a friendlier
2277 // type.
2278 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2279                                                  DAGCombinerInfo &DCI) const {
2280   if (!DCI.isBeforeLegalize())
2281     return SDValue();
2282
2283   LoadSDNode *LN = cast<LoadSDNode>(N);
2284   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2285     return SDValue();
2286
2287   SDLoc SL(N);
2288   SelectionDAG &DAG = DCI.DAG;
2289   EVT VT = LN->getMemoryVT();
2290
2291   unsigned Size = VT.getStoreSize();
2292   unsigned Align = LN->getAlignment();
2293   if (Align < Size && isTypeLegal(VT)) {
2294     bool IsFast;
2295     unsigned AS = LN->getAddressSpace();
2296
2297     // Expand unaligned loads earlier than legalization. Due to visitation order
2298     // problems during legalization, the emitted instructions to pack and unpack
2299     // the bytes again are not eliminated in the case of an unaligned copy.
2300     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2301       if (VT.isVector())
2302         return scalarizeVectorLoad(LN, DAG);
2303
2304       SDValue Ops[2];
2305       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2306       return DAG.getMergeValues(Ops, SDLoc(N));
2307     }
2308
2309     if (!IsFast)
2310       return SDValue();
2311   }
2312
2313   if (!shouldCombineMemoryType(VT))
2314     return SDValue();
2315
2316   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2317
2318   SDValue NewLoad
2319     = DAG.getLoad(NewVT, SL, LN->getChain(),
2320                   LN->getBasePtr(), LN->getMemOperand());
2321
2322   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2323   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2324   return SDValue(N, 0);
2325 }
2326
2327 // Replace store of an illegal type with a store of a bitcast to a friendlier
2328 // type.
2329 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2330                                                   DAGCombinerInfo &DCI) const {
2331   if (!DCI.isBeforeLegalize())
2332     return SDValue();
2333
2334   StoreSDNode *SN = cast<StoreSDNode>(N);
2335   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2336     return SDValue();
2337
2338   EVT VT = SN->getMemoryVT();
2339   unsigned Size = VT.getStoreSize();
2340
2341   SDLoc SL(N);
2342   SelectionDAG &DAG = DCI.DAG;
2343   unsigned Align = SN->getAlignment();
2344   if (Align < Size && isTypeLegal(VT)) {
2345     bool IsFast;
2346     unsigned AS = SN->getAddressSpace();
2347
2348     // Expand unaligned stores earlier than legalization. Due to visitation
2349     // order problems during legalization, the emitted instructions to pack and
2350     // unpack the bytes again are not eliminated in the case of an unaligned
2351     // copy.
2352     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2353       if (VT.isVector())
2354         return scalarizeVectorStore(SN, DAG);
2355
2356       return expandUnalignedStore(SN, DAG);
2357     }
2358
2359     if (!IsFast)
2360       return SDValue();
2361   }
2362
2363   if (!shouldCombineMemoryType(VT))
2364     return SDValue();
2365
2366   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2367   SDValue Val = SN->getValue();
2368
2369   //DCI.AddToWorklist(Val.getNode());
2370
2371   bool OtherUses = !Val.hasOneUse();
2372   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2373   if (OtherUses) {
2374     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2375     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2376   }
2377
2378   return DAG.getStore(SN->getChain(), SL, CastVal,
2379                       SN->getBasePtr(), SN->getMemOperand());
2380 }
2381
2382 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2383 /// binary operation \p Opc to it with the corresponding constant operands.
2384 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2385   DAGCombinerInfo &DCI, const SDLoc &SL,
2386   unsigned Opc, SDValue LHS,
2387   uint32_t ValLo, uint32_t ValHi) const {
2388   SelectionDAG &DAG = DCI.DAG;
2389   SDValue Lo, Hi;
2390   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2391
2392   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2393   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2394
2395   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2396   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2397
2398   // Re-visit the ands. It's possible we eliminated one of them and it could
2399   // simplify the vector.
2400   DCI.AddToWorklist(Lo.getNode());
2401   DCI.AddToWorklist(Hi.getNode());
2402
2403   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2404   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2405 }
2406
2407 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2408                                                 DAGCombinerInfo &DCI) const {
2409   if (N->getValueType(0) != MVT::i64)
2410     return SDValue();
2411
2412   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2413
2414   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2415   // common case, splitting this into a move and a 32-bit shift is faster and
2416   // the same code size.
2417   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2418   if (!RHS)
2419     return SDValue();
2420
2421   unsigned RHSVal = RHS->getZExtValue();
2422   if (RHSVal < 32)
2423     return SDValue();
2424
2425   SDValue LHS = N->getOperand(0);
2426
2427   SDLoc SL(N);
2428   SelectionDAG &DAG = DCI.DAG;
2429
2430   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2431
2432   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2433   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2434
2435   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2436
2437   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2438   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2439 }
2440
2441 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2442                                                 DAGCombinerInfo &DCI) const {
2443   if (N->getValueType(0) != MVT::i64)
2444     return SDValue();
2445
2446   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2447   if (!RHS)
2448     return SDValue();
2449
2450   SelectionDAG &DAG = DCI.DAG;
2451   SDLoc SL(N);
2452   unsigned RHSVal = RHS->getZExtValue();
2453
2454   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2455   if (RHSVal == 32) {
2456     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2457     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2458                                    DAG.getConstant(31, SL, MVT::i32));
2459
2460     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2461     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2462   }
2463
2464   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2465   if (RHSVal == 63) {
2466     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2467     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2468                                    DAG.getConstant(31, SL, MVT::i32));
2469     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2470     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2471   }
2472
2473   return SDValue();
2474 }
2475
2476 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2477                                                 DAGCombinerInfo &DCI) const {
2478   if (N->getValueType(0) != MVT::i64)
2479     return SDValue();
2480
2481   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2482   if (!RHS)
2483     return SDValue();
2484
2485   unsigned ShiftAmt = RHS->getZExtValue();
2486   if (ShiftAmt < 32)
2487     return SDValue();
2488
2489   // srl i64:x, C for C >= 32
2490   // =>
2491   //   build_pair (srl hi_32(x), C - 32), 0
2492
2493   SelectionDAG &DAG = DCI.DAG;
2494   SDLoc SL(N);
2495
2496   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2497   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2498
2499   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2500   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2501                            VecOp, One);
2502
2503   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2504   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2505
2506   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2507
2508   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2509 }
2510
2511 // We need to specifically handle i64 mul here to avoid unnecessary conversion
2512 // instructions. If we only match on the legalized i64 mul expansion,
2513 // SimplifyDemandedBits will be unable to remove them because there will be
2514 // multiple uses due to the separate mul + mulh[su].
2515 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2516                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2517   if (Size <= 32) {
2518     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2519     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2520   }
2521
2522   // Because we want to eliminate extension instructions before the
2523   // operation, we need to create a single user here (i.e. not the separate
2524   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2525
2526   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2527
2528   SDValue Mul = DAG.getNode(MulOpc, SL,
2529                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2530
2531   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2532                      Mul.getValue(0), Mul.getValue(1));
2533 }
2534
2535 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2536                                                 DAGCombinerInfo &DCI) const {
2537   EVT VT = N->getValueType(0);
2538
2539   unsigned Size = VT.getSizeInBits();
2540   if (VT.isVector() || Size > 64)
2541     return SDValue();
2542
2543   // There are i16 integer mul/mad.
2544   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2545     return SDValue();
2546
2547   SelectionDAG &DAG = DCI.DAG;
2548   SDLoc DL(N);
2549
2550   SDValue N0 = N->getOperand(0);
2551   SDValue N1 = N->getOperand(1);
2552   SDValue Mul;
2553
2554   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2555     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2556     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2557     Mul = getMul24(DAG, DL, N0, N1, Size, false);
2558   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2559     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2560     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2561     Mul = getMul24(DAG, DL, N0, N1, Size, true);
2562   } else {
2563     return SDValue();
2564   }
2565
2566   // We need to use sext even for MUL_U24, because MUL_U24 is used
2567   // for signed multiply of 8 and 16-bit types.
2568   return DAG.getSExtOrTrunc(Mul, DL, VT);
2569 }
2570
2571 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2572                                                   DAGCombinerInfo &DCI) const {
2573   EVT VT = N->getValueType(0);
2574
2575   if (!Subtarget->hasMulI24() || VT.isVector())
2576     return SDValue();
2577
2578   SelectionDAG &DAG = DCI.DAG;
2579   SDLoc DL(N);
2580
2581   SDValue N0 = N->getOperand(0);
2582   SDValue N1 = N->getOperand(1);
2583
2584   if (!isI24(N0, DAG) || !isI24(N1, DAG))
2585     return SDValue();
2586
2587   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2588   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2589
2590   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2591   DCI.AddToWorklist(Mulhi.getNode());
2592   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2593 }
2594
2595 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2596                                                   DAGCombinerInfo &DCI) const {
2597   EVT VT = N->getValueType(0);
2598
2599   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2600     return SDValue();
2601
2602   SelectionDAG &DAG = DCI.DAG;
2603   SDLoc DL(N);
2604
2605   SDValue N0 = N->getOperand(0);
2606   SDValue N1 = N->getOperand(1);
2607
2608   if (!isU24(N0, DAG) || !isU24(N1, DAG))
2609     return SDValue();
2610
2611   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2612   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2613
2614   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2615   DCI.AddToWorklist(Mulhi.getNode());
2616   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2617 }
2618
2619 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2620   SDNode *N, DAGCombinerInfo &DCI) const {
2621   SelectionDAG &DAG = DCI.DAG;
2622
2623   // Simplify demanded bits before splitting into multiple users.
2624   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2625     return SDValue();
2626
2627   SDValue N0 = N->getOperand(0);
2628   SDValue N1 = N->getOperand(1);
2629
2630   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2631
2632   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2633   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2634
2635   SDLoc SL(N);
2636
2637   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2638   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2639   return DAG.getMergeValues({ MulLo, MulHi }, SL);
2640 }
2641
2642 static bool isNegativeOne(SDValue Val) {
2643   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2644     return C->isAllOnesValue();
2645   return false;
2646 }
2647
2648 static bool isCtlzOpc(unsigned Opc) {
2649   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2650 }
2651
2652 SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2653                                           SDValue Op,
2654                                           const SDLoc &DL) const {
2655   EVT VT = Op.getValueType();
2656   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2657   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2658                               LegalVT != MVT::i16))
2659     return SDValue();
2660
2661   if (VT != MVT::i32)
2662     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2663
2664   SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2665   if (VT != MVT::i32)
2666     FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
2667
2668   return FFBH;
2669 }
2670
2671 // The native instructions return -1 on 0 input. Optimize out a select that
2672 // produces -1 on 0.
2673 //
2674 // TODO: If zero is not undef, we could also do this if the output is compared
2675 // against the bitwidth.
2676 //
2677 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2678 SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2679                                                  SDValue LHS, SDValue RHS,
2680                                                  DAGCombinerInfo &DCI) const {
2681   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2682   if (!CmpRhs || !CmpRhs->isNullValue())
2683     return SDValue();
2684
2685   SelectionDAG &DAG = DCI.DAG;
2686   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2687   SDValue CmpLHS = Cond.getOperand(0);
2688
2689   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2690   if (CCOpcode == ISD::SETEQ &&
2691       isCtlzOpc(RHS.getOpcode()) &&
2692       RHS.getOperand(0) == CmpLHS &&
2693       isNegativeOne(LHS)) {
2694     return getFFBH_U32(DAG, CmpLHS, SL);
2695   }
2696
2697   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2698   if (CCOpcode == ISD::SETNE &&
2699       isCtlzOpc(LHS.getOpcode()) &&
2700       LHS.getOperand(0) == CmpLHS &&
2701       isNegativeOne(RHS)) {
2702     return getFFBH_U32(DAG, CmpLHS, SL);
2703   }
2704
2705   return SDValue();
2706 }
2707
2708 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
2709                                          unsigned Op,
2710                                          const SDLoc &SL,
2711                                          SDValue Cond,
2712                                          SDValue N1,
2713                                          SDValue N2) {
2714   SelectionDAG &DAG = DCI.DAG;
2715   EVT VT = N1.getValueType();
2716
2717   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
2718                                   N1.getOperand(0), N2.getOperand(0));
2719   DCI.AddToWorklist(NewSelect.getNode());
2720   return DAG.getNode(Op, SL, VT, NewSelect);
2721 }
2722
2723 // Pull a free FP operation out of a select so it may fold into uses.
2724 //
2725 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
2726 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
2727 //
2728 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
2729 // select c, (fabs x), +k -> fabs (select c, x, k)
2730 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
2731                                     SDValue N) {
2732   SelectionDAG &DAG = DCI.DAG;
2733   SDValue Cond = N.getOperand(0);
2734   SDValue LHS = N.getOperand(1);
2735   SDValue RHS = N.getOperand(2);
2736
2737   EVT VT = N.getValueType();
2738   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
2739       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
2740     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
2741                                      SDLoc(N), Cond, LHS, RHS);
2742   }
2743
2744   bool Inv = false;
2745   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
2746     std::swap(LHS, RHS);
2747     Inv = true;
2748   }
2749
2750   // TODO: Support vector constants.
2751   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2752   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
2753     SDLoc SL(N);
2754     // If one side is an fneg/fabs and the other is a constant, we can push the
2755     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
2756     SDValue NewLHS = LHS.getOperand(0);
2757     SDValue NewRHS = RHS;
2758
2759     // Careful: if the neg can be folded up, don't try to pull it back down.
2760     bool ShouldFoldNeg = true;
2761
2762     if (NewLHS.hasOneUse()) {
2763       unsigned Opc = NewLHS.getOpcode();
2764       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
2765         ShouldFoldNeg = false;
2766       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
2767         ShouldFoldNeg = false;
2768     }
2769
2770     if (ShouldFoldNeg) {
2771       if (LHS.getOpcode() == ISD::FNEG)
2772         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2773       else if (CRHS->isNegative())
2774         return SDValue();
2775
2776       if (Inv)
2777         std::swap(NewLHS, NewRHS);
2778
2779       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
2780                                       Cond, NewLHS, NewRHS);
2781       DCI.AddToWorklist(NewSelect.getNode());
2782       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
2783     }
2784   }
2785
2786   return SDValue();
2787 }
2788
2789
2790 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
2791                                                    DAGCombinerInfo &DCI) const {
2792   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
2793     return Folded;
2794
2795   SDValue Cond = N->getOperand(0);
2796   if (Cond.getOpcode() != ISD::SETCC)
2797     return SDValue();
2798
2799   EVT VT = N->getValueType(0);
2800   SDValue LHS = Cond.getOperand(0);
2801   SDValue RHS = Cond.getOperand(1);
2802   SDValue CC = Cond.getOperand(2);
2803
2804   SDValue True = N->getOperand(1);
2805   SDValue False = N->getOperand(2);
2806
2807   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
2808     SelectionDAG &DAG = DCI.DAG;
2809     if ((DAG.isConstantValueOfAnyType(True) ||
2810          DAG.isConstantValueOfAnyType(True)) &&
2811         (!DAG.isConstantValueOfAnyType(False) &&
2812          !DAG.isConstantValueOfAnyType(False))) {
2813       // Swap cmp + select pair to move constant to false input.
2814       // This will allow using VOPC cndmasks more often.
2815       // select (setcc x, y), k, x -> select (setcc y, x) x, x
2816
2817       SDLoc SL(N);
2818       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
2819                                             LHS.getValueType().isInteger());
2820
2821       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
2822       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
2823     }
2824   }
2825
2826   if (VT == MVT::f32 && Cond.hasOneUse()) {
2827     SDValue MinMax
2828       = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
2829     // Revisit this node so we can catch min3/max3/med3 patterns.
2830     //DCI.AddToWorklist(MinMax.getNode());
2831     return MinMax;
2832   }
2833
2834   // There's no reason to not do this if the condition has other uses.
2835   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
2836 }
2837
2838 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
2839                                                  DAGCombinerInfo &DCI) const {
2840   SelectionDAG &DAG = DCI.DAG;
2841   SDValue N0 = N->getOperand(0);
2842   EVT VT = N->getValueType(0);
2843
2844   unsigned Opc = N0.getOpcode();
2845
2846   // If the input has multiple uses and we can either fold the negate down, or
2847   // the other uses cannot, give up. This both prevents unprofitable
2848   // transformations and infinite loops: we won't repeatedly try to fold around
2849   // a negate that has no 'good' form.
2850   //
2851   // TODO: Check users can fold
2852   if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
2853     return SDValue();
2854
2855   SDLoc SL(N);
2856   switch (Opc) {
2857   case ISD::FADD: {
2858     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
2859     SDValue LHS = N0.getOperand(0);
2860     SDValue RHS = N0.getOperand(1);
2861
2862     if (LHS.getOpcode() != ISD::FNEG)
2863       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
2864     else
2865       LHS = LHS.getOperand(0);
2866
2867     if (RHS.getOpcode() != ISD::FNEG)
2868       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2869     else
2870       RHS = RHS.getOperand(0);
2871
2872     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
2873     if (!N0.hasOneUse())
2874       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2875     return Res;
2876   }
2877   case ISD::FMUL:
2878   case AMDGPUISD::FMUL_LEGACY: {
2879     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
2880     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
2881     SDValue LHS = N0.getOperand(0);
2882     SDValue RHS = N0.getOperand(1);
2883
2884     if (LHS.getOpcode() == ISD::FNEG)
2885       LHS = LHS.getOperand(0);
2886     else if (RHS.getOpcode() == ISD::FNEG)
2887       RHS = RHS.getOperand(0);
2888     else
2889       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2890
2891     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
2892     if (!N0.hasOneUse())
2893       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2894     return Res;
2895   }
2896   case ISD::FMA:
2897   case ISD::FMAD: {
2898     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
2899     SDValue LHS = N0.getOperand(0);
2900     SDValue MHS = N0.getOperand(1);
2901     SDValue RHS = N0.getOperand(2);
2902
2903     if (LHS.getOpcode() == ISD::FNEG)
2904       LHS = LHS.getOperand(0);
2905     else if (MHS.getOpcode() == ISD::FNEG)
2906       MHS = MHS.getOperand(0);
2907     else
2908       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
2909
2910     if (RHS.getOpcode() != ISD::FNEG)
2911       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2912     else
2913       RHS = RHS.getOperand(0);
2914
2915     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
2916     if (!N0.hasOneUse())
2917       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
2918     return Res;
2919   }
2920   case ISD::FP_EXTEND:
2921   case AMDGPUISD::RCP:
2922   case AMDGPUISD::RCP_LEGACY:
2923   case ISD::FSIN:
2924   case AMDGPUISD::SIN_HW: {
2925     SDValue CvtSrc = N0.getOperand(0);
2926     if (CvtSrc.getOpcode() == ISD::FNEG) {
2927       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
2928       // (fneg (rcp (fneg x))) -> (rcp x)
2929       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
2930     }
2931
2932     if (!N0.hasOneUse())
2933       return SDValue();
2934
2935     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
2936     // (fneg (rcp x)) -> (rcp (fneg x))
2937     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
2938     return DAG.getNode(Opc, SL, VT, Neg);
2939   }
2940   case ISD::FP_ROUND: {
2941     SDValue CvtSrc = N0.getOperand(0);
2942
2943     if (CvtSrc.getOpcode() == ISD::FNEG) {
2944       // (fneg (fp_round (fneg x))) -> (fp_round x)
2945       return DAG.getNode(ISD::FP_ROUND, SL, VT,
2946                          CvtSrc.getOperand(0), N0.getOperand(1));
2947     }
2948
2949     if (!N0.hasOneUse())
2950       return SDValue();
2951
2952     // (fneg (fp_round x)) -> (fp_round (fneg x))
2953     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
2954     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
2955   }
2956   default:
2957     return SDValue();
2958   }
2959 }
2960
2961 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
2962                                                 DAGCombinerInfo &DCI) const {
2963   SelectionDAG &DAG = DCI.DAG;
2964   SDLoc DL(N);
2965
2966   switch(N->getOpcode()) {
2967   default:
2968     break;
2969   case ISD::BITCAST: {
2970     EVT DestVT = N->getValueType(0);
2971
2972     // Push casts through vector builds. This helps avoid emitting a large
2973     // number of copies when materializing floating point vector constants.
2974     //
2975     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
2976     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
2977     if (DestVT.isVector()) {
2978       SDValue Src = N->getOperand(0);
2979       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2980         EVT SrcVT = Src.getValueType();
2981         unsigned NElts = DestVT.getVectorNumElements();
2982
2983         if (SrcVT.getVectorNumElements() == NElts) {
2984           EVT DestEltVT = DestVT.getVectorElementType();
2985
2986           SmallVector<SDValue, 8> CastedElts;
2987           SDLoc SL(N);
2988           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
2989             SDValue Elt = Src.getOperand(I);
2990             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
2991           }
2992
2993           return DAG.getBuildVector(DestVT, SL, CastedElts);
2994         }
2995       }
2996     }
2997
2998     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
2999       break;
3000
3001     // Fold bitcasts of constants.
3002     //
3003     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3004     // TODO: Generalize and move to DAGCombiner
3005     SDValue Src = N->getOperand(0);
3006     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3007       assert(Src.getValueType() == MVT::i64);
3008       SDLoc SL(N);
3009       uint64_t CVal = C->getZExtValue();
3010       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3011                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3012                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3013     }
3014
3015     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3016       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3017       SDLoc SL(N);
3018       uint64_t CVal = Val.getZExtValue();
3019       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3020                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3021                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3022
3023       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3024     }
3025
3026     break;
3027   }
3028   case ISD::SHL: {
3029     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3030       break;
3031
3032     return performShlCombine(N, DCI);
3033   }
3034   case ISD::SRL: {
3035     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3036       break;
3037
3038     return performSrlCombine(N, DCI);
3039   }
3040   case ISD::SRA: {
3041     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3042       break;
3043
3044     return performSraCombine(N, DCI);
3045   }
3046   case ISD::MUL:
3047     return performMulCombine(N, DCI);
3048   case ISD::MULHS:
3049     return performMulhsCombine(N, DCI);
3050   case ISD::MULHU:
3051     return performMulhuCombine(N, DCI);
3052   case AMDGPUISD::MUL_I24:
3053   case AMDGPUISD::MUL_U24:
3054   case AMDGPUISD::MULHI_I24:
3055   case AMDGPUISD::MULHI_U24: {
    // If the first call to simplify is successful, then N may end up being
    // deleted, so we shouldn't call simplifyI24 again.
3058     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3059     return SDValue();
3060   }
3061   case AMDGPUISD::MUL_LOHI_I24:
3062   case AMDGPUISD::MUL_LOHI_U24:
3063     return performMulLoHi24Combine(N, DCI);
3064   case ISD::SELECT:
3065     return performSelectCombine(N, DCI);
3066   case ISD::FNEG:
3067     return performFNegCombine(N, DCI);
3068   case AMDGPUISD::BFE_I32:
3069   case AMDGPUISD::BFE_U32: {
3070     assert(!N->getValueType(0).isVector() &&
3071            "Vector handling of BFE not implemented");
3072     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3073     if (!Width)
3074       break;
3075
3076     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3077     if (WidthVal == 0)
3078       return DAG.getConstant(0, DL, MVT::i32);
3079
3080     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3081     if (!Offset)
3082       break;
3083
3084     SDValue BitsFrom = N->getOperand(0);
3085     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3086
3087     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3088
3089     if (OffsetVal == 0) {
3090       // This is already sign / zero extended, so try to fold away extra BFEs.
3091       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3092
3093       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3094       if (OpSignBits >= SignBits)
3095         return BitsFrom;
3096
3097       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3098       if (Signed) {
3099         // This is a sign_extend_inreg. Replace it to take advantage of existing
3100         // DAG Combines. If not eliminated, we will match back to BFE during
3101         // selection.
3102
3103         // TODO: The sext_inreg of extended types ends, although we can could
3104         // handle them in a single BFE.
3105         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3106                            DAG.getValueType(SmallVT));
3107       }
3108
3109       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3110     }
3111
3112     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3113       if (Signed) {
3114         return constantFoldBFE<int32_t>(DAG,
3115                                         CVal->getSExtValue(),
3116                                         OffsetVal,
3117                                         WidthVal,
3118                                         DL);
3119       }
3120
3121       return constantFoldBFE<uint32_t>(DAG,
3122                                        CVal->getZExtValue(),
3123                                        OffsetVal,
3124                                        WidthVal,
3125                                        DL);
3126     }
3127
3128     if ((OffsetVal + WidthVal) >= 32) {
3129       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3130       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3131                          BitsFrom, ShiftVal);
3132     }
3133
3134     if (BitsFrom.hasOneUse()) {
3135       APInt Demanded = APInt::getBitsSet(32,
3136                                          OffsetVal,
3137                                          OffsetVal + WidthVal);
3138
3139       APInt KnownZero, KnownOne;
3140       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3141                                             !DCI.isBeforeLegalizeOps());
3142       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3143       if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
3144           TLI.SimplifyDemandedBits(BitsFrom, Demanded,
3145                                    KnownZero, KnownOne, TLO)) {
3146         DCI.CommitTargetLoweringOpt(TLO);
3147       }
3148     }
3149
3150     break;
3151   }
3152   case ISD::LOAD:
3153     return performLoadCombine(N, DCI);
3154   case ISD::STORE:
3155     return performStoreCombine(N, DCI);
3156   }
3157   return SDValue();
3158 }
3159
3160 //===----------------------------------------------------------------------===//
3161 // Helper functions
3162 //===----------------------------------------------------------------------===//
3163
3164 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3165                                                   const TargetRegisterClass *RC,
3166                                                    unsigned Reg, EVT VT) const {
3167   MachineFunction &MF = DAG.getMachineFunction();
3168   MachineRegisterInfo &MRI = MF.getRegInfo();
3169   unsigned VirtualRegister;
3170   if (!MRI.isLiveIn(Reg)) {
3171     VirtualRegister = MRI.createVirtualRegister(RC);
3172     MRI.addLiveIn(Reg, VirtualRegister);
3173   } else {
3174     VirtualRegister = MRI.getLiveInVirtReg(Reg);
3175   }
3176   return DAG.getRegister(VirtualRegister, VT);
3177 }
3178
3179 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3180     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3181   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3182   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3183   switch (Param) {
3184   case GRID_DIM:
3185     return ArgOffset;
3186   case GRID_OFFSET:
3187     return ArgOffset + 4;
3188   }
3189   llvm_unreachable("unexpected implicit parameter type");
3190 }
3191
3192 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3193
3194 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3195   switch ((AMDGPUISD::NodeType)Opcode) {
3196   case AMDGPUISD::FIRST_NUMBER: break;
3197   // AMDIL DAG nodes
3198   NODE_NAME_CASE(CALL);
3199   NODE_NAME_CASE(UMUL);
3200   NODE_NAME_CASE(BRANCH_COND);
3201
3202   // AMDGPU DAG nodes
3203   NODE_NAME_CASE(ENDPGM)
3204   NODE_NAME_CASE(RETURN)
3205   NODE_NAME_CASE(DWORDADDR)
3206   NODE_NAME_CASE(FRACT)
3207   NODE_NAME_CASE(SETCC)
3208   NODE_NAME_CASE(SETREG)
3209   NODE_NAME_CASE(FMA_W_CHAIN)
3210   NODE_NAME_CASE(FMUL_W_CHAIN)
3211   NODE_NAME_CASE(CLAMP)
3212   NODE_NAME_CASE(COS_HW)
3213   NODE_NAME_CASE(SIN_HW)
3214   NODE_NAME_CASE(FMAX_LEGACY)
3215   NODE_NAME_CASE(FMIN_LEGACY)
3216   NODE_NAME_CASE(FMAX3)
3217   NODE_NAME_CASE(SMAX3)
3218   NODE_NAME_CASE(UMAX3)
3219   NODE_NAME_CASE(FMIN3)
3220   NODE_NAME_CASE(SMIN3)
3221   NODE_NAME_CASE(UMIN3)
3222   NODE_NAME_CASE(FMED3)
3223   NODE_NAME_CASE(SMED3)
3224   NODE_NAME_CASE(UMED3)
3225   NODE_NAME_CASE(URECIP)
3226   NODE_NAME_CASE(DIV_SCALE)
3227   NODE_NAME_CASE(DIV_FMAS)
3228   NODE_NAME_CASE(DIV_FIXUP)
3229   NODE_NAME_CASE(TRIG_PREOP)
3230   NODE_NAME_CASE(RCP)
3231   NODE_NAME_CASE(RSQ)
3232   NODE_NAME_CASE(RCP_LEGACY)
3233   NODE_NAME_CASE(RSQ_LEGACY)
3234   NODE_NAME_CASE(FMUL_LEGACY)
3235   NODE_NAME_CASE(RSQ_CLAMP)
3236   NODE_NAME_CASE(LDEXP)
3237   NODE_NAME_CASE(FP_CLASS)
3238   NODE_NAME_CASE(DOT4)
3239   NODE_NAME_CASE(CARRY)
3240   NODE_NAME_CASE(BORROW)
3241   NODE_NAME_CASE(BFE_U32)
3242   NODE_NAME_CASE(BFE_I32)
3243   NODE_NAME_CASE(BFI)
3244   NODE_NAME_CASE(BFM)
3245   NODE_NAME_CASE(FFBH_U32)
3246   NODE_NAME_CASE(FFBH_I32)
3247   NODE_NAME_CASE(MUL_U24)
3248   NODE_NAME_CASE(MUL_I24)
3249   NODE_NAME_CASE(MULHI_U24)
3250   NODE_NAME_CASE(MULHI_I24)
3251   NODE_NAME_CASE(MUL_LOHI_U24)
3252   NODE_NAME_CASE(MUL_LOHI_I24)
3253   NODE_NAME_CASE(MAD_U24)
3254   NODE_NAME_CASE(MAD_I24)
3255   NODE_NAME_CASE(TEXTURE_FETCH)
3256   NODE_NAME_CASE(EXPORT)
3257   NODE_NAME_CASE(EXPORT_DONE)
3258   NODE_NAME_CASE(R600_EXPORT)
3259   NODE_NAME_CASE(CONST_ADDRESS)
3260   NODE_NAME_CASE(REGISTER_LOAD)
3261   NODE_NAME_CASE(REGISTER_STORE)
3262   NODE_NAME_CASE(LOAD_INPUT)
3263   NODE_NAME_CASE(SAMPLE)
3264   NODE_NAME_CASE(SAMPLEB)
3265   NODE_NAME_CASE(SAMPLED)
3266   NODE_NAME_CASE(SAMPLEL)
3267   NODE_NAME_CASE(CVT_F32_UBYTE0)
3268   NODE_NAME_CASE(CVT_F32_UBYTE1)
3269   NODE_NAME_CASE(CVT_F32_UBYTE2)
3270   NODE_NAME_CASE(CVT_F32_UBYTE3)
3271   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3272   NODE_NAME_CASE(CONST_DATA_PTR)
3273   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3274   NODE_NAME_CASE(KILL)
3275   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3276   NODE_NAME_CASE(SENDMSG)
3277   NODE_NAME_CASE(SENDMSGHALT)
3278   NODE_NAME_CASE(INTERP_MOV)
3279   NODE_NAME_CASE(INTERP_P1)
3280   NODE_NAME_CASE(INTERP_P2)
3281   NODE_NAME_CASE(STORE_MSKOR)
3282   NODE_NAME_CASE(LOAD_CONSTANT)
3283   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3284   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3285   NODE_NAME_CASE(ATOMIC_INC)
3286   NODE_NAME_CASE(ATOMIC_DEC)
3287   NODE_NAME_CASE(BUFFER_LOAD)
3288   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3289   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3290   }
3291   return nullptr;
3292 }
3293
3294 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3295                                               SelectionDAG &DAG, int Enabled,
3296                                               int &RefinementSteps,
3297                                               bool &UseOneConstNR,
3298                                               bool Reciprocal) const {
3299   EVT VT = Operand.getValueType();
3300
3301   if (VT == MVT::f32) {
3302     RefinementSteps = 0;
3303     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3304   }
3305
3306   // TODO: There is also f64 rsq instruction, but the documentation is less
3307   // clear on its precision.
3308
3309   return SDValue();
3310 }
3311
3312 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3313                                                SelectionDAG &DAG, int Enabled,
3314                                                int &RefinementSteps) const {
3315   EVT VT = Operand.getValueType();
3316
3317   if (VT == MVT::f32) {
3318     // Reciprocal, < 1 ulp error.
3319     //
3320     // This reciprocal approximation converges to < 0.5 ulp error with one
3321     // newton rhapson performed with two fused multiple adds (FMAs).
3322
3323     RefinementSteps = 0;
3324     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3325   }
3326
3327   // TODO: There is also f64 rcp instruction, but the documentation is less
3328   // clear on its precision.
3329
3330   return SDValue();
3331 }
3332
3333 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3334   const SDValue Op,
3335   APInt &KnownZero,
3336   APInt &KnownOne,
3337   const SelectionDAG &DAG,
3338   unsigned Depth) const {
3339
3340   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
3341
3342   APInt KnownZero2;
3343   APInt KnownOne2;
3344   unsigned Opc = Op.getOpcode();
3345
3346   switch (Opc) {
3347   default:
3348     break;
3349   case AMDGPUISD::CARRY:
3350   case AMDGPUISD::BORROW: {
3351     KnownZero = APInt::getHighBitsSet(32, 31);
3352     break;
3353   }
3354
3355   case AMDGPUISD::BFE_I32:
3356   case AMDGPUISD::BFE_U32: {
3357     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3358     if (!CWidth)
3359       return;
3360
3361     unsigned BitWidth = 32;
3362     uint32_t Width = CWidth->getZExtValue() & 0x1f;
3363
3364     if (Opc == AMDGPUISD::BFE_U32)
3365       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
3366
3367     break;
3368   }
3369   }
3370 }
3371
3372 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3373   SDValue Op,
3374   const SelectionDAG &DAG,
3375   unsigned Depth) const {
3376   switch (Op.getOpcode()) {
3377   case AMDGPUISD::BFE_I32: {
3378     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3379     if (!Width)
3380       return 1;
3381
3382     unsigned SignBits = 32 - Width->getZExtValue() + 1;
3383     if (!isNullConstant(Op.getOperand(1)))
3384       return SignBits;
3385
3386     // TODO: Could probably figure something out with non-0 offsets.
3387     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3388     return std::max(SignBits, Op0SignBits);
3389   }
3390
3391   case AMDGPUISD::BFE_U32: {
3392     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3393     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3394   }
3395
3396   case AMDGPUISD::CARRY:
3397   case AMDGPUISD::BORROW:
3398     return 31;
3399
3400   default:
3401     return 1;
3402   }
3403 }