contrib/llvm/lib/Target/AMDGPU/SIInstructions.td

   1 //===-- SIInstructions.td - SI Instruction Definitions --------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 // This file was originally auto-generated from a GPU register header file and
  10 // all the instruction definitions were originally commented out.  Instructions
  11 // that are not yet supported remain commented out.
  12 //===----------------------------------------------------------------------===//
  13
  14 def isGCN : Predicate<"Subtarget->getGeneration() "
  15                       ">= SISubtarget::SOUTHERN_ISLANDS">,
  16             AssemblerPredicate<"FeatureGCN">;
  17 def isSI : Predicate<"Subtarget->getGeneration() "
  18                       "== SISubtarget::SOUTHERN_ISLANDS">,
  19            AssemblerPredicate<"FeatureSouthernIslands">;
  20
  21 def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
  22 def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
  23 def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
  24                       AssemblerPredicate<"FeatureVGPRIndexMode">;
  25 def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
  26                 AssemblerPredicate<"FeatureMovrel">;
  27
  28 include "VOPInstructions.td"
  29 include "SOPInstructions.td"
  30 include "SMInstructions.td"
  31 include "FLATInstructions.td"
  32 include "BUFInstructions.td"
  33
  34 let SubtargetPredicate = isGCN in {
  35
  36 //===----------------------------------------------------------------------===//
  37 // EXP Instructions
  38 //===----------------------------------------------------------------------===//
  39
  40 defm EXP : EXP_m<0, AMDGPUexport>;
  41 defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
  42
  43 //===----------------------------------------------------------------------===//
  44 // VINTRP Instructions
  45 //===----------------------------------------------------------------------===//
  46
  47 let Uses = [M0, EXEC] in {
  48
  49 // FIXME: Specify SchedRW for VINTRP instructions.
  50
  51 multiclass V_INTERP_P1_F32_m : VINTRP_m <
  52   0x00000000,
  53   (outs VGPR_32:$vdst),
  54   (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  55   "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
  56   [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
  57                                                (i32 imm:$attr)))]
  58 >;
  59
  60 let OtherPredicates = [has32BankLDS] in {
  61
  62 defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
  63
  64 } // End OtherPredicates = [has32BankLDS]
  65
  66 let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
  67
  68 defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
  69
  70 } // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
  71
  72 let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
  73
  74 defm V_INTERP_P2_F32 : VINTRP_m <
  75   0x00000001,
  76   (outs VGPR_32:$vdst),
  77   (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  78   "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
  79   [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
  80                                                           (i32 imm:$attr)))]>;
  81
  82 } // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
  83
  84 defm V_INTERP_MOV_F32 : VINTRP_m <
  85   0x00000002,
  86   (outs VGPR_32:$vdst),
  87   (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
  88   "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
  89   [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
  90                                      (i32 imm:$attr)))]>;
  91
  92 } // End Uses = [M0, EXEC]
  93
  94 //===----------------------------------------------------------------------===//
  95 // Pseudo Instructions
  96 //===----------------------------------------------------------------------===//
  97 def ATOMIC_FENCE : SPseudoInstSI<
  98   (outs), (ins i32imm:$ordering, i32imm:$scope),
  99   [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
 100   "ATOMIC_FENCE $ordering, $scope"> {
 101   let hasSideEffects = 1;
 102 }
 103
 104 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 105
 106 // For use in patterns
 107 def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 108   (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
 109   let isPseudo = 1;
 110   let isCodeGenOnly = 1;
 111   let usesCustomInserter = 1;
 112 }
 113
 114 // 64-bit vector move instruction.  This is mainly used by the SIFoldOperands
 115 // pass to enable folding of inline immediates.
 116 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
 117                                       (ins VSrc_b64:$src0)>;
 118 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 119
 120 let usesCustomInserter = 1, SALU = 1 in {
 121 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
 122   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
 123 } // End let usesCustomInserter = 1, SALU = 1
 124
 125 def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
 126    (ins SSrc_b64:$src0)> {
 127   let SALU = 1;
 128   let isAsCheapAsAMove = 1;
 129   let isTerminator = 1;
 130 }
 131
 132 def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
 133    (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
 134   let SALU = 1;
 135   let isAsCheapAsAMove = 1;
 136   let isTerminator = 1;
 137 }
 138
 139 def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
 140    (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
 141   let SALU = 1;
 142   let isAsCheapAsAMove = 1;
 143   let isTerminator = 1;
 144 }
 145
 146 def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
 147   [(int_amdgcn_wave_barrier)]> {
 148   let SchedRW = [];
 149   let hasNoSchedulingInfo = 1;
 150   let hasSideEffects = 1;
 151   let mayLoad = 1;
 152   let mayStore = 1;
 153   let isBarrier = 1;
 154   let isConvergent = 1;
 155   let FixedSize = 1;
 156   let Size = 0;
 157 }
 158
 159 // SI pseudo instructions. These are used by the CFG structurizer pass
 160 // and should be lowered to ISA instructions prior to codegen.
 161
 162 // Dummy terminator instruction to use after control flow instructions
 163 // replaced with exec mask operations.
 164 def SI_MASK_BRANCH : VPseudoInstSI <
 165   (outs), (ins brtarget:$target)> {
 166   let isBranch = 0;
 167   let isTerminator = 1;
 168   let isBarrier = 0;
 169   let SchedRW = [];
 170   let hasNoSchedulingInfo = 1;
 171   let FixedSize = 1;
 172   let Size = 0;
 173 }
 174
 175 let isTerminator = 1 in {
 176
 177 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
 178   (outs),
 179   (ins SReg_64:$vcc, brtarget:$target),
 180   [(brcond i1:$vcc, bb:$target)]> {
 181   let Size = 12;
 182 }
 183
 184 def SI_IF: CFPseudoInstSI <
 185   (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
 186   [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
 187   let Constraints = "";
 188   let Size = 12;
 189   let hasSideEffects = 1;
 190 }
 191
 192 def SI_ELSE : CFPseudoInstSI <
 193   (outs SReg_64:$dst),
 194   (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
 195   let Constraints = "$src = $dst";
 196   let Size = 12;
 197   let hasSideEffects = 1;
 198 }
 199
 200 def SI_LOOP : CFPseudoInstSI <
 201   (outs), (ins SReg_64:$saved, brtarget:$target),
 202   [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
 203   let Size = 8;
 204   let isBranch = 0;
 205   let hasSideEffects = 1;
 206 }
 207
 208 } // End isTerminator = 1
 209
 210 def SI_END_CF : CFPseudoInstSI <
 211   (outs), (ins SReg_64:$saved),
 212   [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
 213   let Size = 4;
 214   let isAsCheapAsAMove = 1;
 215   let isReMaterializable = 1;
 216   let hasSideEffects = 1;
 217   let mayLoad = 1; // FIXME: Should not need memory flags
 218   let mayStore = 1;
 219 }
 220
 221 def SI_BREAK : CFPseudoInstSI <
 222   (outs SReg_64:$dst), (ins SReg_64:$src),
 223   [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
 224   let Size = 4;
 225   let isAsCheapAsAMove = 1;
 226   let isReMaterializable = 1;
 227 }
 228
 229 def SI_IF_BREAK : CFPseudoInstSI <
 230   (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
 231   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
 232   let Size = 4;
 233   let isAsCheapAsAMove = 1;
 234   let isReMaterializable = 1;
 235 }
 236
 237 def SI_ELSE_BREAK : CFPseudoInstSI <
 238   (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
 239   [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
 240   let Size = 4;
 241   let isAsCheapAsAMove = 1;
 242   let isReMaterializable = 1;
 243 }
 244
 245 let Uses = [EXEC], Defs = [EXEC,VCC] in {
 246 def SI_KILL : PseudoInstSI <
 247   (outs), (ins VSrc_b32:$src),
 248   [(AMDGPUkill i32:$src)]> {
 249   let isConvergent = 1;
 250   let usesCustomInserter = 1;
 251 }
 252
 253 def SI_KILL_TERMINATOR : SPseudoInstSI <
 254   (outs), (ins VSrc_b32:$src)> {
 255   let isTerminator = 1;
 256 }
 257
 258 def SI_ILLEGAL_COPY : SPseudoInstSI <
 259   (outs unknown:$dst), (ins unknown:$src),
 260   [], " ; illegal copy $src to $dst">;
 261
 262 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 263
 264 // Branch on undef scc. Used to avoid intermediate copy from
 265 // IMPLICIT_DEF to SCC.
 266 def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
 267   let isTerminator = 1;
 268   let usesCustomInserter = 1;
 269 }
 270
 271 def SI_PS_LIVE : PseudoInstSI <
 272   (outs SReg_64:$dst), (ins),
 273   [(set i1:$dst, (int_amdgcn_ps_live))]> {
 274   let SALU = 1;
 275 }
 276
 277 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
 278   [(int_amdgcn_unreachable)],
 279   "; divergent unreachable"> {
 280   let Size = 0;
 281   let hasNoSchedulingInfo = 1;
 282   let FixedSize = 1;
 283 }
 284
 285 // Used as an isel pseudo to directly emit initialization with an
 286 // s_mov_b32 rather than a copy of another initialized
 287 // register. MachineCSE skips copies, and we don't want to have to
 288 // fold operands before it runs.
 289 def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
 290   let Defs = [M0];
 291   let usesCustomInserter = 1;
 292   let isAsCheapAsAMove = 1;
 293   let isReMaterializable = 1;
 294 }
 295
 296 def SI_INIT_EXEC : SPseudoInstSI <
 297   (outs), (ins i64imm:$src), []> {
 298   let Defs = [EXEC];
 299   let usesCustomInserter = 1;
 300   let isAsCheapAsAMove = 1;
 301 }
 302
 303 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
 304   (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
 305   let Defs = [EXEC];
 306   let usesCustomInserter = 1;
 307 }
 308
 309 // Return for returning shaders to a shader variant epilog.
 310 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
 311   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
 312   let isTerminator = 1;
 313   let isBarrier = 1;
 314   let isReturn = 1;
 315   let hasNoSchedulingInfo = 1;
 316   let DisableWQM = 1;
 317 }
 318
 319 let Defs = [M0, EXEC],
 320   UseNamedOperandTable = 1 in {
 321
 322 class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
 323   (outs VGPR_32:$vdst),
 324   (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
 325   let usesCustomInserter = 1;
 326 }
 327
 328 class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
 329   (outs rc:$vdst),
 330   (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
 331   let Constraints = "$src = $vdst";
 332   let usesCustomInserter = 1;
 333 }
 334
 335 // TODO: We can support indirect SGPR access.
 336 def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
 337 def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
 338 def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
 339 def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
 340 def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
 341
 342 def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
 343 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 344 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 345 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
 346 def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 347
 348 } // End Defs = [M0, EXEC], UseNamedOperandTable = 1
 349
 350 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
 351   let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
 352     def _SAVE : PseudoInstSI <
 353       (outs),
 354       (ins sgpr_class:$data, i32imm:$addr)> {
 355       let mayStore = 1;
 356       let mayLoad = 0;
 357     }
 358
 359     def _RESTORE : PseudoInstSI <
 360       (outs sgpr_class:$data),
 361       (ins i32imm:$addr)> {
 362       let mayStore = 0;
 363       let mayLoad = 1;
 364     }
 365   } // End UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC]
 366 }
 367
 368 // You cannot use M0 as the output of v_readlane_b32 instructions or
 369 // use it in the sdata operand of SMEM instructions. We still need to
 370 // be able to spill the physical register m0, so allow it for
 371 // SI_SPILL_S32_* instructions.
 372 defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
 373 defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
 374 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 375 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 376 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 377
 378 multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
 379   let UseNamedOperandTable = 1, VGPRSpill = 1,
 380        SchedRW = [WriteVMEM] in {
 381     def _SAVE : VPseudoInstSI <
 382       (outs),
 383       (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
 384            SReg_32:$soffset, i32imm:$offset)> {
 385       let mayStore = 1;
 386       let mayLoad = 0;
 387       // At most (2 * 4) + (8 * num_subregs) bytes, num_subregs = vgpr_class.Size >> 5.
 388       let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
 389     }
 390
 391     def _RESTORE : VPseudoInstSI <
 392       (outs vgpr_class:$vdata),
 393       (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
 394            i32imm:$offset)> {
 395       let mayStore = 0;
 396       let mayLoad = 1;
 397
 398       // At most (2 * 4) + (8 * num_subregs) bytes, num_subregs = vgpr_class.Size >> 5.
 399       let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
 400     }
 401   } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
 402 }
 403
 404 defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
 405 defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
 406 defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
 407 defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
 408 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 409 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 410
 411 def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
 412   (outs SReg_64:$dst),
 413   (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
 414   [(set SReg_64:$dst,
 415    (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
 416   let Defs = [SCC];
 417 }
 418
 419 } // End SubtargetPredicate = isGCN
 420
 421 let Predicates = [isGCN] in {
 422 def : Pat <
 423   (AMDGPUinit_exec i64:$src),
 424   (SI_INIT_EXEC (as_i64imm $src))
 425 >;
 426
 427 def : Pat <
 428   (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
 429   (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
 430 >;
 431
 432 def : Pat<
 433   (AMDGPUtrap timm:$trapid),
 434   (S_TRAP $trapid)
 435 >;
 436
 437 def : Pat<
 438   (AMDGPUelse i64:$src, bb:$target),
 439   (SI_ELSE $src, $target, 0)
 440 >;
 441
 442 def : Pat <
 443   (int_AMDGPU_kilp),
 444   (SI_KILL (i32 0xbf800000)) // 0xbf800000 is the IEEE-754 single-precision encoding of -1.0
 445 >;
 446
 447 //===----------------------------------------------------------------------===//
 448 // VOP1 Patterns
 449 //===----------------------------------------------------------------------===//
 450
 451 let Predicates = [UnsafeFPMath] in {
 452
 453 //def : RcpPat<V_RCP_F64_e32, f64>;
 454 //defm : RsqPat<V_RSQ_F64_e32, f64>;
 455 //defm : RsqPat<V_RSQ_F32_e32, f32>;
 456
 457 def : RsqPat<V_RSQ_F32_e32, f32>;
 458 def : RsqPat<V_RSQ_F64_e32, f64>;
 459
 460 // Convert (x - floor(x)) to fract(x)
 461 def : Pat <
 462   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
 463              (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
 464   (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
 465 >;
 466
 467 // Convert (x + (-floor(x))) to fract(x)
 468 def : Pat <
 469   (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
 470              (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
 471   (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
 472 >;
 473
 474 } // End Predicates = [UnsafeFPMath]
 475
 476
 477 // f16_to_fp patterns
 478 def : Pat <
 479   (f32 (f16_to_fp i32:$src0)),
 480   (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 481 >;
 482
 483 def : Pat <
 484   (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
 485   (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 486 >;
 487
 488 def : Pat <
 489   (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
 490   (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 491 >;
 492
 493 def : Pat <
 494   (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
 495   (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 496 >;
 497
 498 def : Pat <
 499   (f64 (fpextend f16:$src)),
 500   (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
 501 >;
 502
 503 // fp_to_fp16 patterns
 504 def : Pat <
 505   (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
 506   (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 507 >;
 508
 509 def : Pat <
 510   (i32 (fp_to_sint f16:$src)),
 511   (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
 512 >;
 513
 514 def : Pat <
 515   (i32 (fp_to_uint f16:$src)),
 516   (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
 517 >;
 518
 519 def : Pat <
 520   (f16 (sint_to_fp i32:$src)),
 521   (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
 522 >;
 523
 524 def : Pat <
 525   (f16 (uint_to_fp i32:$src)),
 526   (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
 527 >;
 528
 529 //===----------------------------------------------------------------------===//
 530 // VOP2 Patterns
 531 //===----------------------------------------------------------------------===//
 532
 533 multiclass FMADPat <ValueType vt, Instruction inst> {
 534   def : Pat <
 535     (vt (fmad (VOP3NoMods vt:$src0),
 536               (VOP3NoMods vt:$src1),
 537               (VOP3NoMods vt:$src2))),
 538     (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
 539           SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 540   >;
 541 }
 542
 543 defm : FMADPat <f16, V_MAC_F16_e64>;
 544 defm : FMADPat <f32, V_MAC_F32_e64>;
 545
 546 class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat <
 547   (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
 548                 (VOP3Mods f32:$src1, i32:$src1_mod),
 549                 (VOP3Mods f32:$src2, i32:$src2_mod))),
 550   (inst $src0_mod, $src0, $src1_mod, $src1,
 551         $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
 552 >; // Maps mad_opr with per-source VOP3Mods operands onto inst.
 553
 554 def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
 555
 556 multiclass SelectPat <ValueType vt, Instruction inst> {
 557   def : Pat <
 558     (vt (select i1:$src0, vt:$src1, vt:$src2)),
 559     (inst $src2, $src1, $src0)
 560   >;
 561 }
 562
 563 defm : SelectPat <i16, V_CNDMASK_B32_e64>;
 564 defm : SelectPat <i32, V_CNDMASK_B32_e64>;
 565 defm : SelectPat <f16, V_CNDMASK_B32_e64>;
 566 defm : SelectPat <f32, V_CNDMASK_B32_e64>;
 567
 568 def : Pat <
 569   (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
 570   (V_BCNT_U32_B32_e64 $popcnt, $val)
 571 >;
 572
 573 /********** ============================================ **********/
 574 /********** Extraction, Insertion, Building and Casting  **********/
 575 /********** ============================================ **********/
 576
 577 foreach Index = 0-1 in { // v2i32/v2f32 have exactly two elements: sub0 and sub1.
 578   def Extract_Element_v2i32_#Index : Extract_Element <
 579     i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
 580   >;
 581   def Insert_Element_v2i32_#Index : Insert_Element <
 582     i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
 583   >;
 584
 585   def Extract_Element_v2f32_#Index : Extract_Element <
 586     f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
 587   >;
 588   def Insert_Element_v2f32_#Index : Insert_Element <
 589     f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
 590   >;
 591 }
 592
 593 foreach Index = 0-3 in {
 594   def Extract_Element_v4i32_#Index : Extract_Element <
 595     i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
 596   >;
 597   def Insert_Element_v4i32_#Index : Insert_Element <
 598     i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
 599   >;
 600
 601   def Extract_Element_v4f32_#Index : Extract_Element <
 602     f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
 603   >;
 604   def Insert_Element_v4f32_#Index : Insert_Element <
 605     f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
 606   >;
 607 }
 608
 609 foreach Index = 0-7 in {
 610   def Extract_Element_v8i32_#Index : Extract_Element <
 611     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
 612   >;
 613   def Insert_Element_v8i32_#Index : Insert_Element <
 614     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
 615   >;
 616
 617   def Extract_Element_v8f32_#Index : Extract_Element <
 618     f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
 619   >;
 620   def Insert_Element_v8f32_#Index : Insert_Element <
 621     f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
 622   >;
 623 }
 624
 625 foreach Index = 0-15 in {
 626   def Extract_Element_v16i32_#Index : Extract_Element <
 627     i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
 628   >;
 629   def Insert_Element_v16i32_#Index : Insert_Element <
 630     i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
 631   >;
 632
 633   def Extract_Element_v16f32_#Index : Extract_Element <
 634     f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
 635   >;
 636   def Insert_Element_v16f32_#Index : Insert_Element <
 637     f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
 638   >;
 639 }
 640
 641 // FIXME: Why do only some of these type combinations for SReg and
 642 // VReg?
 643 // 16-bit bitcast
 644 def : BitConvert <i16, f16, VGPR_32>;
 645 def : BitConvert <f16, i16, VGPR_32>;
 646 def : BitConvert <i16, f16, SReg_32>;
 647 def : BitConvert <f16, i16, SReg_32>;
 648
 649 // 32-bit bitcast
 650 def : BitConvert <i32, f32, VGPR_32>;
 651 def : BitConvert <f32, i32, VGPR_32>;
 652 def : BitConvert <i32, f32, SReg_32>;
 653 def : BitConvert <f32, i32, SReg_32>;
 654 def : BitConvert <v2i16, i32, SReg_32>;
 655 def : BitConvert <i32, v2i16, SReg_32>;
 656 def : BitConvert <v2f16, i32, SReg_32>;
 657 def : BitConvert <i32, v2f16, SReg_32>;
 658 def : BitConvert <v2i16, v2f16, SReg_32>;
 659 def : BitConvert <v2f16, v2i16, SReg_32>;
 660 def : BitConvert <v2f16, f32, SReg_32>;
 661 def : BitConvert <f32, v2f16, SReg_32>;
 662 def : BitConvert <v2i16, f32, SReg_32>;
 663 def : BitConvert <f32, v2i16, SReg_32>;
 664
 665 // 64-bit bitcast
 666 def : BitConvert <i64, f64, VReg_64>;
 667 def : BitConvert <f64, i64, VReg_64>;
 668 def : BitConvert <v2i32, v2f32, VReg_64>;
 669 def : BitConvert <v2f32, v2i32, VReg_64>;
 670 def : BitConvert <i64, v2i32, VReg_64>;
 671 def : BitConvert <v2i32, i64, VReg_64>;
 672 def : BitConvert <i64, v2f32, VReg_64>;
 673 def : BitConvert <v2f32, i64, VReg_64>;
 674 def : BitConvert <f64, v2f32, VReg_64>;
 675 def : BitConvert <v2f32, f64, VReg_64>;
 676 def : BitConvert <f64, v2i32, VReg_64>;
 677 def : BitConvert <v2i32, f64, VReg_64>;
 678 def : BitConvert <v4i32, v4f32, VReg_128>; // NOTE: 128-bit (VReg_128) despite the 64-bit heading
 679 def : BitConvert <v4f32, v4i32, VReg_128>; // NOTE: 128-bit (VReg_128) despite the 64-bit heading
 680
 681 // 128-bit bitcast
 682 def : BitConvert <v2i64, v4i32, SReg_128>;
 683 def : BitConvert <v4i32, v2i64, SReg_128>;
 684 def : BitConvert <v2f64, v4f32, VReg_128>;
 685 def : BitConvert <v2f64, v4i32, VReg_128>;
 686 def : BitConvert <v4f32, v2f64, VReg_128>;
 687 def : BitConvert <v4i32, v2f64, VReg_128>;
 688 def : BitConvert <v2i64, v2f64, VReg_128>;
 689 def : BitConvert <v2f64, v2i64, VReg_128>;
 690
 691 // 256-bit bitcast
 692 def : BitConvert <v8i32, v8f32, SReg_256>;
 693 def : BitConvert <v8f32, v8i32, SReg_256>;
 694 def : BitConvert <v8i32, v8f32, VReg_256>;
 695 def : BitConvert <v8f32, v8i32, VReg_256>;
 696
 697 // 512-bit bitcast
 698 def : BitConvert <v16i32, v16f32, VReg_512>;
 699 def : BitConvert <v16f32, v16i32, VReg_512>;
 700
 701 /********** =================== **********/
 702 /********** Src & Dst modifiers **********/
 703 /********** =================== **********/
 704
 705
 706 // If denormals are not enabled, it only impacts the compare of the
 707 // inputs. The output result is not flushed.
 708 class ClampPat<Instruction inst, ValueType vt> : Pat <
 709   (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
 710   (inst i32:$src0_modifiers, vt:$src0,
 711         i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
 712 >;
 713
 714 def : ClampPat<V_MAX_F32_e64, f32>;
 715 def : ClampPat<V_MAX_F64, f64>;
 716 def : ClampPat<V_MAX_F16_e64, f16>;
 717
 718 /********** ================================ **********/
 719 /********** Floating point absolute/negative **********/
 720 /********** ================================ **********/
 721
 722 // Prevent expanding both fneg and fabs.
 723
 724 def : Pat <
 725   (fneg (fabs f32:$src)),
 726   (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
 727 >;
 728
 729 // FIXME: Should use S_OR_B32
 730 def : Pat <
 731   (fneg (fabs f64:$src)),
 732   (REG_SEQUENCE VReg_64,
 733     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
 734     sub0,
 735     (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
 736                   (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
 737     sub1)
 738 >;
 739
 740 def : Pat <
 741   (fabs f32:$src),
 742   (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) // Clear sign bit
 743 >;
 744
 745 def : Pat <
 746   (fneg f32:$src),
 747   (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) // Flip sign bit
 748 >;
 749
 750 def : Pat <
 751   (fabs f64:$src),
 752   (REG_SEQUENCE VReg_64,
 753     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
 754     sub0,
 755     (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
 756                    (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
 757      sub1)
 758 >;
 759
 760 def : Pat <
 761   (fneg f64:$src),
 762   (REG_SEQUENCE VReg_64,
 763     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
 764     sub0,
 765     (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
 766                    (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
 767     sub1)
 768 >;
 769
 770 def : Pat <
 771   (fcopysign f16:$src0, f16:$src1),
 772   (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
 773 >;
 774
 775 def : Pat <
 776   (fcopysign f32:$src0, f16:$src1),
 777   (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
 778              (V_LSHLREV_B32_e64 (i32 16), $src1))
 779 >;
 780
 781 def : Pat <
 782   (fcopysign f64:$src0, f16:$src1),
 783   (REG_SEQUENCE SReg_64,
 784     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
 785     (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
 786                (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
 787 >;
 788
 789 def : Pat <
 790   (fcopysign f16:$src0, f32:$src1),
 791   (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
 792              (V_LSHRREV_B32_e64 (i32 16), $src1))
 793 >;
 794
 795 def : Pat <
 796   (fcopysign f16:$src0, f64:$src1),
 797   (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
 798              (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 799 >;
 800
 801 def : Pat <
 802   (fneg f16:$src),
 803   (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) // Flip sign bit (bit 15)
 804 >;
 805
 806 def : Pat <
 807   (fabs f16:$src),
 808   (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) // Clear sign bit (bit 15)
 809 >;
 810
 811 def : Pat <
 812   (fneg (fabs f16:$src)),
 813   (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
 814 >;
 815
 816 def : Pat <
 817   (fneg v2f16:$src),
 818   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) // Flip sign bit of both halves
 819 >;
 820
 821 def : Pat <
 822   (fabs v2f16:$src),
 823   (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) // Clear sign bit of both halves
 824 >;
 825
 826 // This is really (fneg (fabs v2f16:$src))
 827 //
 828 // fabs is not reported as free because there is no modifier for it in
 829 // VOP3P instructions, so it is turned into the bit op.
 830 def : Pat <
 831   (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
 832   (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
 833 >;
 834
 835 /********** ================== **********/
 836 /********** Immediate Patterns **********/
 837 /********** ================== **********/
 838
 839 def : Pat <
 840   (VGPRImm<(i32 imm)>:$imm),
 841   (V_MOV_B32_e32 imm:$imm)
 842 >;
 843
 844 def : Pat <
 845   (VGPRImm<(f32 fpimm)>:$imm),
 846   (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 847 >;
 848
 849 def : Pat <
 850   (i32 imm:$imm),
 851   (S_MOV_B32 imm:$imm)
 852 >;
 853
 854 // FIXME: Workaround for ordering issue with peephole optimizer where
 855 // a register class copy interferes with immediate folding.  Should
 856 // use s_mov_b32, which can be shrunk to s_movk_i32
 857 def : Pat <
 858   (VGPRImm<(f16 fpimm)>:$imm),
 859   (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
 860 >;
 861
 862 def : Pat <
 863   (f32 fpimm:$imm),
 864   (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 865 >;
 866
 867 def : Pat <
 868   (f16 fpimm:$imm),
 869   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
 870 >;
 871
 872 def : Pat <
 873  (i32 frameindex:$fi),
 874  (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
 875 >;
 876
 877 def : Pat <
 878   (i64 InlineImm<i64>:$imm),
 879   (S_MOV_B64 InlineImm<i64>:$imm)
 880 >;
 881
 882 // XXX - Should this use a s_cmp to set SCC?
 883
 884 // Set to sign-extended 64-bit value (true = -1, false = 0)
 885 def : Pat <
 886   (i1 imm:$imm),
 887   (S_MOV_B64 (i64 (as_i64imm $imm)))
 888 >;
 889
 890 def : Pat <
 891   (f64 InlineFPImm<f64>:$imm),
 892   (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
 893 >;
 894
 895 /********** ================== **********/
 896 /********** Intrinsic Patterns **********/
 897 /********** ================== **********/
 898
 899 def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
 900
 901 def : Pat <
 902   (i32 (sext i1:$src0)),
 903   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
 904 >;
 905
 906 class Ext32Pat <SDNode ext> : Pat <
 907   (i32 (ext i1:$src0)),
 908   (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
 909 >;
 910
 911 def : Ext32Pat <zext>;
 912 def : Ext32Pat <anyext>;
 913
 914 // The multiplication scales from [0,1] to the unsigned integer range
 915 def : Pat <
 916   (AMDGPUurecip i32:$src0),
 917   (V_CVT_U32_F32_e32
 918     (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
 919                    (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
 920 >;
 921
 922 //===----------------------------------------------------------------------===//
 923 // VOP3 Patterns
 924 //===----------------------------------------------------------------------===//
 925
 926 def : IMad24Pat<V_MAD_I32_I24>;
 927 def : UMad24Pat<V_MAD_U32_U24>;
 928
 929 defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
 930 def : ROTRPattern <V_ALIGNBIT_B32>;
 931
     // Truncating a 64-bit logical right shift to i32 is selected as
     // V_ALIGNBIT_B32 over the (sub1, sub0) halves of the source. This form
     // matches a variable shift amount that has been masked with 31; the
     // unmasked $src1 is what gets passed to the instruction.
     // NOTE(review): this relies on V_ALIGNBIT_B32 only consuming the low five
     // bits of its shift operand - confirm against the ISA manual.
 932 def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
 933           (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
 934                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 935
     // Same selection for a constant shift amount accepted by ShiftAmt32Imm
     // (defined elsewhere; presumably limits the amount to 0..31 - confirm).
 936 def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
 937           (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
 938                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 939
 940 /********** ====================== **********/
 941 /**********   Indirect addressing  **********/
 942 /********** ====================== **********/
 943
     // Patterns for vector element access with a non-constant index.
     //   vt      - vector value type being indexed
     //   eltvt   - element value type of that vector
     //   VecSize - suffix appended to "SI_INDIRECT_SRC_" / "SI_INDIRECT_DST_" to
     //             name the instruction to select (looked up with !cast)
     // MOVRELOffset (defined elsewhere) matches the index and binds a register
     // part $idx and an immediate part $offset.
 944 multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
 945   // Extract with offset
 946   def : Pat<
 947     (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
 948     (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
 949   >;
 950
 951   // Insert with offset
       // The inserted value is passed as the last operand, after the offset.
 952   def : Pat<
 953     (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
 954     (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
 955   >;
 956 }
 957
     // Indirect extract/insert patterns for 2-, 4-, 8- and 16-element f32 vectors.
 958 defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
 959 defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
 960 defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
 961 defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
 962
     // The i32 vectors reuse the same instruction suffixes as the f32 vectors of
     // equal element count.
 963 defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
 964 defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
 965 defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
 966 defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
 967
 968 //===----------------------------------------------------------------------===//
 969 // SAD Patterns
 970 //===----------------------------------------------------------------------===//
 971
     // (umax(a, b) - umin(a, b)) + c  ->  V_SAD_U32 a, b, c.
     // sub_oneuse restricts the match to a subtraction with a single use.
 972 def : Pat <
 973   (add (sub_oneuse (umax i32:$src0, i32:$src1),
 974                    (umin i32:$src0, i32:$src1)),
 975        i32:$src2),
 976   (V_SAD_U32 $src0, $src1, $src2)
 977 >;
 978
     // Select-based spelling of the same absolute difference:
     // (a >u b ? a - b : b - a) + c  ->  V_SAD_U32 a, b, c.
 979 def : Pat <
 980   (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
 981                       (sub i32:$src0, i32:$src1),
 982                       (sub i32:$src1, i32:$src0)),
 983        i32:$src2),
 984   (V_SAD_U32 $src0, $src1, $src2)
 985 >;
 986
 987 //===----------------------------------------------------------------------===//
 988 // Conversion Patterns
 989 //===----------------------------------------------------------------------===//
 990
     // sext_inreg is selected as a scalar signed bitfield extract. As the
     // per-pattern comments note, the immediate operand is written as
     // (offset | width << 16); every pattern below uses offset 0 and a width
     // equal to the bit width of the type being extended from.
     // 65536 here is the same value as the 0x10000 used by the other i1 patterns.
 991 def : Pat<(i32 (sext_inreg i32:$src, i1)),
 992   (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
 993
 994 // Handle sext_inreg in i64
 995 def : Pat <
 996   (i64 (sext_inreg i64:$src, i1)),
 997   (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
 998 >;
 999
     // i16 values use the 32-bit extract instruction.
1000 def : Pat <
1001   (i16 (sext_inreg i16:$src, i1)),
1002   (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
1003 >;
1004
1005 def : Pat <
1006   (i16 (sext_inreg i16:$src, i8)),
1007   (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
1008 >;
1009
1010 def : Pat <
1011   (i64 (sext_inreg i64:$src, i8)),
1012   (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
1013 >;
1014
1015 def : Pat <
1016   (i64 (sext_inreg i64:$src, i16)),
1017   (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
1018 >;
1019
1020 def : Pat <
1021   (i64 (sext_inreg i64:$src, i32)),
1022   (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
1023 >;
1024
     // i32 -> i64 zero extension: build a 64-bit scalar register pair with the
     // source in sub0 and a zero materialized by S_MOV_B32 in sub1.
1025 def : Pat <
1026   (i64 (zext i32:$src)),
1027   (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
1028 >;
1029
     // i32 -> i64 any-extension: the sub1 half is left undefined (IMPLICIT_DEF).
1030 def : Pat <
1031   (i64 (anyext i32:$src)),
1032   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
1033 >;
1034
     // i1 -> i64 extension yielding 0 or 1: sub0 is a V_CNDMASK_B32 choosing
     // between 0 and 1, sub1 is a zero from S_MOV_B32, and the pair is assembled
     // in VReg_64. Parameterized over the extension node being matched.
1035 class ZExt_i64_i1_Pat <SDNode ext> : Pat <
1036   (i64 (ext i1:$src)),
1037     (REG_SEQUENCE VReg_64,
1038       (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
1039       (S_MOV_B32 (i32 0)), sub1)
1040 >;
1041
1042
     // Despite the class name, the same selection is also used for anyext.
1043 def : ZExt_i64_i1_Pat<zext>;
1044 def : ZExt_i64_i1_Pat<anyext>;
1045
     // i32 -> i64 sign extension: sub0 is the source and sub1 is the source
     // arithmetically shifted right by 31.
1046 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
1047 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
1048 def : Pat <
1049   (i64 (sext i32:$src)),
1050     (REG_SEQUENCE SReg_64, $src, sub0,
1051     (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
1052 >;
1053
     // i1 -> i64 sign extension: both halves are the same 0 / -1 select on $src.
1054 def : Pat <
1055   (i64 (sext i1:$src)),
1056   (REG_SEQUENCE VReg_64,
1057     (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
1058     (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
1059 >;
1060
     // Floating point -> i1 conversion selected as an equality compare of the
     // source against a constant.
     //   Inst      - compare instruction to emit
     //   KOne      - integer constant passed as the first compare operand
     //   kone_type - value type given to KOne
     //   vt        - floating-point source type
     //   fp_to_int - conversion node being matched
     // Source modifiers matched by VOP3Mods are forwarded for $src0; the
     // constant operand gets modifier value 0 and the clamp is DSTCLAMP.NONE.
1061 class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
1062   (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
1063   (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
1064 >;
1065
     // Unsigned conversions compare against the *_ONE constant, signed
     // conversions against the *_NEG_ONE constant.
1066 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
1067 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
1068 def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
1069 def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
1070
1071 // If we need to perform a logical operation on i1 values, we need to
1072 // use vector comparisons since there is only one SCC register. Vector
1073 // comparisons still write to a pair of SGPRs, so treat these as
1074 // 64-bit comparisons. When legalizing SGPR copies, instructions
1075 // resulting in the copies from SCC to these instructions will be
1076 // moved to the VALU.
     // i1 and -> 64-bit scalar AND.
1077 def : Pat <
1078   (i1 (and i1:$src0, i1:$src1)),
1079   (S_AND_B64 $src0, $src1)
1080 >;
1081
     // i1 or -> 64-bit scalar OR.
1082 def : Pat <
1083   (i1 (or i1:$src0, i1:$src1)),
1084   (S_OR_B64 $src0, $src1)
1085 >;
1086
     // i1 xor -> 64-bit scalar XOR.
1087 def : Pat <
1088   (i1 (xor i1:$src0, i1:$src1)),
1089   (S_XOR_B64 $src0, $src1)
1090 >;
1091
     // i1 -> f32 conversions are a single select between 0 and the constant
     // CONST.FP32_NEG_ONE (signed) or CONST.FP32_ONE (unsigned).
1092 def : Pat <
1093   (f32 (sint_to_fp i1:$src)),
1094   (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
1095 >;
1096
1097 def : Pat <
1098   (f32 (uint_to_fp i1:$src)),
1099   (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
1100 >;
1101
     // i1 -> f64 conversions first select the integer 0 / -1 (signed) or 0 / 1
     // (unsigned) and then run the matching 32-bit integer to f64 conversion.
1102 def : Pat <
1103   (f64 (sint_to_fp i1:$src)),
1104   (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
1105 >;
1106
1107 def : Pat <
1108   (f64 (uint_to_fp i1:$src)),
1109   (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
1110 >;
1111
1112 //===----------------------------------------------------------------------===//
1113 // Miscellaneous Patterns
1114 //===----------------------------------------------------------------------===//
     // AMDGPUfp16_zext of an f16 to i32 is selected as a plain COPY.
     // NOTE(review): presumably valid because the upper register bits are
     // already zero for f16 values - confirm with the node's definition.
1115 def : Pat <
1116   (i32 (AMDGPUfp16_zext f16:$src)),
1117   (COPY $src)
1118 >;
1119
1120
1121 def : Pat <
1122   (i32 (trunc i64:$a)),
1123   (EXTRACT_SUBREG $a, sub0)
1124 >;
1125
1126 def : Pat <
1127   (i1 (trunc i32:$a)),
1128   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
1129 >;
1130
1131 def : Pat <
1132   (i1 (trunc i16:$a)),
1133   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
1134 >;
1135
1136 def : Pat <
1137   (i1 (trunc i64:$a)),
1138   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
1139                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
1140 >;
1141
     // 32-bit byte swap built from two V_ALIGNBIT_B32 of the value with itself
     // (shift amounts 24 and 8) merged by V_BFI_B32 under the mask 0x00ff00ff.
     // NOTE(review): assuming the ISA semantics (alignbit(a, a, n) is a rotate
     // right by n; bfi(m, x, y) = (m & x) | (~m & y)), bytes 0 and 2 of the
     // result come from the rotate by 24 and bytes 1 and 3 from the rotate by
     // 8 - confirm against the ISA manual.
1142 def : Pat <
1143   (i32 (bswap i32:$a)),
1144   (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
1145              (V_ALIGNBIT_B32 $a, $a, (i32 24)),
1146              (V_ALIGNBIT_B32 $a, $a, (i32 8)))
1147 >;
1148
1149 multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
1150   def : Pat <
1151     (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
1152     (BFM $a, $b)
1153   >;
1154
1155   def : Pat <
1156     (vt (add (vt (shl 1, vt:$a)), -1)),
1157     (BFM $a, (MOV (i32 0)))
1158   >;
1159 }
1160
     // Only the 32-bit scalar bitfield-mask patterns are instantiated; the
     // 64-bit variant below is disabled.
1161 defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
1162 // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
     // Bitfield-extract patterns (BFEPattern is defined elsewhere) using the
     // unsigned and signed vector extract instructions.
1163 defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
1164
     // fcanonicalize is selected as a multiply by the constant CONST.*_ONE. The
     // source modifiers matched by VOP3Mods / VOP3PMods are forwarded on the
     // second multiply operand; the constant operand gets modifier value 0
     // (SRCMODS.OP_SEL_1 for the packed form).
     // NOTE(review): the trailing "0, 0" operands are presumably clamp and
     // omod - confirm against the instruction definitions.
1165 def : Pat<
1166   (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
1167   (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
1168 >;
1169
1170 def : Pat<
1171   (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
1172   (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
1173 >;
1174
1175 def : Pat<
1176   (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
1177   (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
1178 >;
1179
     // Packed v2f16 form; it passes DSTCLAMP.NONE as its only trailing operand.
1180 def : Pat<
1181   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
1182   (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
1183 >;
1184
1185
1186 // Allow integer inputs
     // Generic export pattern.
     //   node - export node to match
     //   vt   - value type of the four source operands
     //   Inst - export instruction to emit
     // Note the operand reordering: the node lists (tgt, en, src0..src3, compr,
     // vm) while the instruction takes (tgt, src0..src3, vm, compr, en).
1187 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
1188   (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
1189   (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
1190 >;
1191
     // i32-typed sources for both the plain and the "done" export.
1192 def : ExpPattern<AMDGPUexport, i32, EXP>;
1193 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
1194
     // v2i16 build_vector of two i16 values -> S_PACK_LL_B32_B16.
1195 def : Pat <
1196   (v2i16 (build_vector i16:$src0, i16:$src1)),
1197   (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
1198 >;
1199
     // When the second element is the truncated result of an i32 shifted right
     // by 16, use S_PACK_LH_B32_B16 on the unshifted i32 instead.
1200 // With multiple uses of the shift, this will duplicate the shift and
1201 // increase register pressure.
1202 def : Pat <
1203   (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
1204   (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
1205 >;
1206
     // Both elements come from i32 values shifted right by 16 ->
     // S_PACK_HH_B32_B16 on the unshifted i32 values.
1207 def : Pat <
1208   (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
1209                        (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
1210   (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
1211 >;
1212
1213 // TODO: Should source modifiers be matched to v_pack_b32_f16?
     // v2f16 build_vector uses the same scalar pack instruction as v2i16.
1214 def : Pat <
1215   (v2f16 (build_vector f16:$src0, f16:$src1)),
1216   (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
1217 >;
1218
1219 // def : Pat <
1220 //   (v2f16 (scalar_to_vector f16:$src0)),
1221 //   (COPY $src0)
1222 // >;
1223
1224 // def : Pat <
1225 //   (v2i16 (scalar_to_vector i16:$src0)),
1226 //   (COPY $src0)
1227 // >;
1228
1229 //===----------------------------------------------------------------------===//
1230 // Fract Patterns
1231 //===----------------------------------------------------------------------===//
1232
     // The pattern in this block applies only when the isSI predicate holds.
1233 let Predicates = [isSI] in {
1234
1235 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
1236 // used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
1237 // way to implement it is using V_FRACT_F64.
1238 // The workaround for the V_FRACT bug is:
1239 //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
1240
1241 // Convert floor(x) to (x - fract(x))
     // Structure of the selected DAG, outermost first:
     //   V_ADD_F64            - adds $x (with its matched modifiers) to the
     //                          negated (SRCMODS.NEG) corrected fract value
     //   V_CNDMASK_B64_PSEUDO - chooses between the clamped fract and $x based on
     //                          the class test
     //   V_MIN_F64            - clamps V_FRACT_F64 against the f64 bit pattern
     //                          0x3fefffffffffffff (the constant in the formula
     //                          above)
     //   V_CMP_CLASS_F64      - class test with mask 3, the NaN check
1242 def : Pat <
1243   (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
1244   (V_ADD_F64
1245       $mods,
1246       $x,
1247       SRCMODS.NEG,
1248       (V_CNDMASK_B64_PSEUDO
1249          (V_MIN_F64
1250              SRCMODS.NONE,
1251              (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
1252              SRCMODS.NONE,
1253              (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
1254              DSTCLAMP.NONE, DSTOMOD.NONE),
1255          $x,
1256          (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
1257       DSTCLAMP.NONE, DSTOMOD.NONE)
1258 >;
1259
1260 } // End Predicates = [isSI]
1261
1262 //============================================================================//
1263 // Miscellaneous Optimization Patterns
1264 //============================================================================//
1265
1266 // Undo sub x, c -> add x, -c canonicalization since c is more likely
1267 // an inline immediate than -c.
1268 // TODO: Also do for 64-bit.
     // NOTE(review): NegSubInlineConst32 is defined elsewhere; presumably it
     // matches a constant whose negation is an inline immediate and emits the
     // negated value as the S_SUB_I32 operand - confirm against its definition.
1269 def : Pat<
1270   (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
1271   (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
1272 >;
1273
     // Instantiate the SHA256MaPattern class (defined elsewhere) with the
     // bitfield-insert and xor instructions.
1274 def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
1275
     // 32-bit integer median-of-three patterns, signed then unsigned. IntMed3Pat
     // is defined elsewhere; it is given the max node plus single-use max/min
     // fragments.
1276 def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
1277 def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
1278
1279 // This matches 16 permutations of
1280 // max(min(x, y), min(max(x, y), z))
     //   vt       - floating-point value type of the operands
     //   med3Inst - median-of-three instruction to emit
     // The inner min/max nodes are matched through their *_oneuse fragments.
     // Operands are matched with VOP3Mods_nnan (defined elsewhere) and the
     // captured source modifiers are forwarded to the instruction.
     // NOTE(review): VOP3Mods_nnan presumably only matches operands known not
     // to be NaN - confirm against its definition.
1281 class FPMed3Pat<ValueType vt,
1282                 Instruction med3Inst> : Pat<
1283   (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
1284                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
1285            (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
1286                                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
1287                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
1288   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
1289 >;
1290
     // f32 median-of-three; it carries no predicate beyond the enclosing block's.
1291 def : FPMed3Pat<f32, V_MED3_F32>;
1292
     // 16-bit median-of-three patterns, enabled only under the isGFX9 predicate.
1293 let Predicates = [isGFX9] in {
1294 def : FPMed3Pat<f16, V_MED3_F16>;
     // The integer forms pass i16 as an extra value-type argument to IntMed3Pat.
1295 def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
1296 def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
1297 } // End Predicates = [isGFX9]
1298
1299 //============================================================================//
1300 // Assembler aliases
1301 //============================================================================//
1302
     // Let the assembler accept the *_u32 spellings of the 32-bit add/sub
     // mnemonics by mapping each one to the corresponding *_i32 mnemonic.
1303 def : MnemonicAlias<"v_add_u32", "v_add_i32">;
1304 def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
1305 def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
1306
1307 } // End isGCN predicate