contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

   1 //===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Interface definition of the TargetLowering class that is common
  12 /// to all AMD GPUs.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
  17 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
  18
  19 #include "AMDGPU.h"
  20 #include "llvm/CodeGen/CallingConvLower.h"
  21 #include "llvm/Target/TargetLowering.h"
  22
  23 namespace llvm {
  24
  25 class AMDGPUMachineFunction; // Only used through pointers in this header.
  26 class AMDGPUSubtarget;       // Only used through a pointer and a reference here.
  27 class MachineRegisterInfo;   // NOTE(review): not referenced anywhere in this header.
  28
  29 class AMDGPUTargetLowering : public TargetLowering {
       // TargetLowering interface shared by all AMD GPU targets. The class is
       // abstract: PostISelFolding() is declared pure virtual below, so a derived
       // class has to provide it. Apart from a handful of inline members, everything
       // here is only declared; the definitions are not part of this header.
  30 private:
  31   /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
  32   /// legalized from a smaller type VT. Need to match pre-legalized type because
  33   /// the generic legalization inserts the add/sub between the select and
  34   /// compare.
  35   SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
  36
  37 public:
  38   static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
  39
  40 protected:
  41   const AMDGPUSubtarget *Subtarget;
       // Handed out by value through getAMDGPUAS().
  42   AMDGPUAS AMDGPUASI;
  43
  44   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
  45   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
  46   // Per-operation lowering helpers follow. They are only declared here, so their
  47   // exact behavior cannot be described from this header alone.
  48
  49   SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
  50   SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
  51   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
  52   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
  53   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
  54
  55   SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
  56   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
  57   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
  58   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
  59
  60   SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
  61
  62   SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  63   SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  64   SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  65   SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  66
  67   SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  68   SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
  69   SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
  70   SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
  71
  72   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
  73
       // NOTE(review): the access level is already protected at this point, so the
       // specifier below only acts as a visual separator for the combine helpers.
  74 protected:
  75   bool shouldCombineMemoryType(EVT VT) const;
  76   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  77   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  78   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  79   SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  80
  81   SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
  82                                        unsigned Opc, SDValue LHS,
  83                                        uint32_t ValLo, uint32_t ValHi) const;
  84   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  85   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  86   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  87   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  88   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  89   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  90   SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
  91   SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
  92                              SDValue RHS, DAGCombinerInfo &DCI) const;
  93   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  94   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  95   SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  96
  97   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
  98
  99   virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
 100                                      SelectionDAG &DAG) const;
 101
 102   /// Return 64-bit value Op as two 32-bit integers.
 103   std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
 104                                               SelectionDAG &DAG) const;
 105   SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
 106   SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
 107
 108   /// \brief Split a vector load into 2 loads of half the vector.
 109   SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
 110
 111   /// \brief Split a vector store into 2 stores of half the vector.
 112   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
 113
 114   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 115   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
 116   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
 117   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
       // Unlike the lowering helpers above, the next two report through an output
       // argument (\p Results / \p State) instead of returning an SDValue.
 118   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
 119                                     SmallVectorImpl<SDValue> &Results) const;
 120   void analyzeFormalArgumentsCompute(CCState &State,
 121                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 122 public:
 123   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 124
       /// \returns true when signed zeros may be ignored for \p Op: either the
       /// target machine option NoSignedZerosFPMath is set, or the flags on the
       /// node are defined and include the no-signed-zeros flag. Returns false in
       /// every other case, including when the node flags are not defined.
 125   bool mayIgnoreSignedZero(SDValue Op) const {
 126     if (getTargetMachine().Options.NoSignedZerosFPMath)
 127       return true;
 128
 129     const auto Flags = Op.getNode()->getFlags();
 130     if (Flags.isDefined())
 131       return Flags.hasNoSignedZeros();
 132
 133     return false;
 134   }
 135
 136   static bool allUsesHaveSourceMods(const SDNode *N,
 137                                     unsigned CostThreshold = 4);
 138   bool isFAbsFree(EVT VT) const override;
 139   bool isFNegFree(EVT VT) const override;
 140   bool isTruncateFree(EVT Src, EVT Dest) const override;
 141   bool isTruncateFree(Type *Src, Type *Dest) const override;
 142
 143   bool isZExtFree(Type *Src, Type *Dest) const override;
 144   bool isZExtFree(EVT Src, EVT Dest) const override;
 145   bool isZExtFree(SDValue Val, EVT VT2) const override;
 146
 147   bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 148
 149   MVT getVectorIdxTy(const DataLayout &) const override;
 150   bool isSelectSupported(SelectSupportKind) const override;
 151
 152   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 153   bool ShouldShrinkFPConstant(EVT VT) const override;
 154   bool shouldReduceLoadWidth(SDNode *Load,
 155                              ISD::LoadExtType ExtType,
 156                              EVT ExtVT) const override;
 157
       // Declared final, so classes derived from this one cannot override it.
 158   bool isLoadBitCastBeneficial(EVT, EVT) const final;
 159
 160   bool storeOfVectorConstantIsCheap(EVT MemVT,
 161                                     unsigned NumElem,
 162                                     unsigned AS) const override;
 163   bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
 164   bool isCheapToSpeculateCttz() const override;
 165   bool isCheapToSpeculateCtlz() const override;
 166
 167   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
 168   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 169
 170   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
 171                       const SmallVectorImpl<ISD::OutputArg> &Outs,
 172                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
 173                       SelectionDAG &DAG) const override;
 174   SDValue LowerCall(CallLoweringInfo &CLI,
 175                     SmallVectorImpl<SDValue> &InVals) const override;
 176
 177   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
 178                                   SelectionDAG &DAG) const;
 179
 180   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 181   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 182   void ReplaceNodeResults(SDNode * N,
 183                           SmallVectorImpl<SDValue> &Results,
 184                           SelectionDAG &DAG) const override;
 185
 186   SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
 187                                SDValue RHS, SDValue True, SDValue False,
 188                                SDValue CC, DAGCombinerInfo &DCI) const;
 189
 190   const char* getTargetNodeName(unsigned Opcode) const override;
 191
       /// Unconditionally reports fsqrt as cheap; both arguments are ignored.
 192   bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
 193     return true;
 194   }
 195   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
 196                            int &RefinementSteps, bool &UseOneConstNR,
 197                            bool Reciprocal) const override;
 198   SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
 199                            int &RefinementSteps) const override;
 200
       /// Pure virtual hook: this class supplies no implementation, which is what
       /// makes AMDGPUTargetLowering abstract.
 201   virtual SDNode *PostISelFolding(MachineSDNode *N,
 202                                   SelectionDAG &DAG) const = 0;
 203
 204   /// \brief Determine which bits of \p Op are known to be either zero or one
 205   /// and return them in \p Known. \p DemandedElts presumably selects the vector
 206   /// elements of interest (TODO confirm against the base-class documentation).
 207   void computeKnownBitsForTargetNode(const SDValue Op,
 208                                      KnownBits &Known,
 209                                      const APInt &DemandedElts,
 210                                      const SelectionDAG &DAG,
 211                                      unsigned Depth = 0) const override;
 212
 213   unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
 214                                            const SelectionDAG &DAG,
 215                                            unsigned Depth = 0) const override;
 216
 217   /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
 218   /// MachineFunction.
 219   ///
 220   /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
 221   /// a copy from the register.
 222   SDValue CreateLiveInRegister(SelectionDAG &DAG,
 223                                const TargetRegisterClass *RC,
 224                                unsigned Reg, EVT VT,
 225                                const SDLoc &SL,
 226                                bool RawReg = false) const;
       /// Convenience overload: forwards to the overload above with an SDLoc built
       /// from the DAG's entry node and RawReg left at its default (false).
 227   SDValue CreateLiveInRegister(SelectionDAG &DAG,
 228                                const TargetRegisterClass *RC,
 229                                unsigned Reg, EVT VT) const {
 230     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
 231   }
 232
 233   // Returns the raw live-in register rather than a copy from it.
       // Same forwarding as the four-argument overload, but with RawReg = true.
 234   SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
 235                                   const TargetRegisterClass *RC,
 236                                   unsigned Reg, EVT VT) const {
 237     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
 238   }
 239
       /// Kinds of implicit parameter accepted by getImplicitParameterOffset().
       /// GRID_DIM aliases FIRST_IMPLICIT (both are 0); GRID_OFFSET is 1.
 240   enum ImplicitParameter {
 241     FIRST_IMPLICIT,
 242     GRID_DIM = FIRST_IMPLICIT,
 243     GRID_OFFSET,
 244   };
 245
 246   /// \brief Helper function that returns the byte offset of the given
 247   /// type of implicit parameter.
 248   uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
 249                                       const ImplicitParameter Param) const;
 250
       /// \returns a copy of the AMDGPUASI member.
 251   AMDGPUAS getAMDGPUAS() const {
 252     return AMDGPUASI;
 253   }
 254
       /// Fence operands always use i32; \p DL is not consulted.
 255   MVT getFenceOperandTy(const DataLayout &DL) const override {
 256     return MVT::i32;
 257   }
 258 };
 259
 260 namespace AMDGPUISD {
 261
 262 enum NodeType : unsigned {
       // AMDGPU-specific node opcodes. Apart from the two explicit anchors
       // (FIRST_NUMBER and FIRST_MEM_OPCODE_NUMBER) every enumerator is numbered
       // implicitly, so inserting, removing or reordering an entry shifts the value
       // of every entry after it up to the next anchor.
 263   // AMDIL ISD Opcodes
       // Numbering starts at ISD::BUILTIN_OP_END.
 264   FIRST_NUMBER = ISD::BUILTIN_OP_END,
 265   UMUL,        // 32bit unsigned multiplication
 266   BRANCH_COND,
 267   // End AMDIL ISD Opcodes
 268
 269   // Function call.
 270   CALL,
 271   TRAP,
 272
 273   // Masked control flow nodes.
 274   IF,
 275   ELSE,
 276   LOOP,
 277
 278   // A uniform kernel return that terminates the wavefront.
 279   ENDPGM,
 280
 281   // Return to a shader part's epilog code.
 282   RETURN_TO_EPILOG,
 283
 284   // Return with values from a non-entry function.
 285   RET_FLAG,
 286
 287   DWORDADDR,
 288   FRACT,
 289
 290   /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
 291   /// modifier behavior with dx10_enable.
 292   CLAMP,
 293
 294   // This is SETCC with the full mask result which is used for a compare with a
 295   // result bit per item in the wavefront.
 296   SETCC,
 297   SETREG,
 298   // FP ops with input and output chain.
 299   FMA_W_CHAIN,
 300   FMUL_W_CHAIN,
 301
 302   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
 303   // Denormals handled on some parts.
 304   COS_HW,
 305   SIN_HW,
 306   FMAX_LEGACY,
 307   FMIN_LEGACY,
 308   FMAX3,
 309   SMAX3,
 310   UMAX3,
 311   FMIN3,
 312   SMIN3,
 313   UMIN3,
 314   FMED3,
 315   SMED3,
 316   UMED3,
 317   URECIP,
 318   DIV_SCALE,
 319   DIV_FMAS,
 320   DIV_FIXUP,
 321   // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
 322   // treated as an illegal operation.
 323   FMAD_FTZ,
 324   TRIG_PREOP, // 1 ULP max error for f64
 325
 326   // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
 327   //            For f64, max error 2^29 ULP, handles denormals.
 328   RCP,
 329   RSQ,
 330   RCP_LEGACY,
 331   RSQ_LEGACY,
 332   FMUL_LEGACY,
 333   RSQ_CLAMP,
 334   LDEXP,
 335   FP_CLASS,
 336   DOT4,
 337   CARRY,
 338   BORROW,
 339   BFE_U32, // Extract range of bits with zero extension to 32-bits.
 340   BFE_I32, // Extract range of bits with sign extension to 32-bits.
 341   BFI, // (src0 & src1) | (~src0 & src2)
 342   BFM, // Insert a range of bits into a 32-bit word.
 343   FFBH_U32, // ctlz with -1 if input is zero.
 344   FFBH_I32,
 345   MUL_U24,
 346   MUL_I24,
 347   MULHI_U24,
 348   MULHI_I24,
 349   MAD_U24,
 350   MAD_I24,
 351   MUL_LOHI_I24,
 352   MUL_LOHI_U24,
 353   TEXTURE_FETCH,
 354   EXPORT, // exp on SI+
 355   EXPORT_DONE, // exp on SI+ with done bit set
 356   R600_EXPORT,
 357   CONST_ADDRESS,
 358   REGISTER_LOAD,
 359   REGISTER_STORE,
 360   SAMPLE,
 361   SAMPLEB,
 362   SAMPLED,
 363   SAMPLEL,
 364
 365   // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
 366   CVT_F32_UBYTE0,
 367   CVT_F32_UBYTE1,
 368   CVT_F32_UBYTE2,
 369   CVT_F32_UBYTE3,
 370
 371   // Convert two float 32 numbers into a single register holding two packed f16
 372   // with round to zero.
 373   CVT_PKRTZ_F16_F32,
 374
 375   // Same as the standard node, except the high bits of the resulting integer
 376   // are known 0.
 377   FP_TO_FP16,
 378
 379   // Wrapper around fp16 results that are known to zero the high bits.
 380   FP16_ZEXT,
 381
 382   /// This node is for VLIW targets and it is used to represent a vector
 383   /// that is stored in consecutive registers with the same channel.
 384   /// For example:
 385   ///   |X  |Y|Z|W|
 386   /// T0|v.x| | | |
 387   /// T1|v.y| | | |
 388   /// T2|v.z| | | |
 389   /// T3|v.w| | | |
 390   BUILD_VERTICAL_VECTOR,
 391   /// Pointer to the start of the shader's constant data.
 392   CONST_DATA_PTR,
 393   INIT_EXEC,
 394   INIT_EXEC_FROM_INPUT,
 395   SENDMSG,
 396   SENDMSGHALT,
 397   INTERP_MOV,
 398   INTERP_P1,
 399   INTERP_P2,
 400   PC_ADD_REL_OFFSET,
 401   KILL,
 402   DUMMY_CHAIN,
       // Numbering restarts at ISD::FIRST_TARGET_MEMORY_OPCODE. The entries from
       // here to the end are presumably the nodes that access memory (TODO confirm
       // against the ISD documentation).
 403   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
 404   STORE_MSKOR,
 405   LOAD_CONSTANT,
 406   TBUFFER_STORE_FORMAT,
 407   TBUFFER_STORE_FORMAT_X3,
 408   TBUFFER_LOAD_FORMAT,
 409   ATOMIC_CMP_SWAP,
 410   ATOMIC_INC,
 411   ATOMIC_DEC,
 412   BUFFER_LOAD,
 413   BUFFER_LOAD_FORMAT,
       // Sentinel: one greater than BUFFER_LOAD_FORMAT, the last real opcode above.
 414   LAST_AMDGPU_ISD_NUMBER
 415 };
 416
 417
 418 } // End namespace AMDGPUISD
 419
 420 } // End namespace llvm
 421
 422 #endif