contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td

   1 //===-- SIInstructions.td - SI Instruction Definitions --------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 // This file was originally auto-generated from a GPU register header file and
   9 // all the instruction definitions were originally commented out.  Instructions
  10 // that are not yet supported remain commented out.
  11 //===----------------------------------------------------------------------===//
  12
  13 class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
  14
  15 }
  16
  17 class UniformSextInreg<ValueType VT> : PatFrag<
  18   (ops node:$src),
  19   (sext_inreg $src, VT),
  20   [{ return !N->isDivergent(); }]>;
  21
  22 class DivergentSextInreg<ValueType VT> : PatFrag<
  23   (ops node:$src),
  24   (sext_inreg $src, VT),
  25   [{ return N->isDivergent(); }]>;
  26
  27 include "SOPInstructions.td"
  28 include "VOPInstructions.td"
  29 include "SMInstructions.td"
  30 include "FLATInstructions.td"
  31 include "BUFInstructions.td"
  32 include "EXPInstructions.td"
  33 include "LDSDIRInstructions.td"
  34 include "VINTERPInstructions.td"
  35
  36 //===----------------------------------------------------------------------===//
  37 // VINTRP Instructions
  38 //===----------------------------------------------------------------------===//
  39
  40 // Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
  41 def VINTRPDst : VINTRPDstOperand <VGPR_32>;
  42
  43 let Uses = [MODE, M0, EXEC] in {
  44
  45 // FIXME: Specify SchedRW for VINTRP instructions.
  46
  47 multiclass V_INTERP_P1_F32_m : VINTRP_m <
  48   0x00000000,
  49   (outs VINTRPDst:$vdst),
  50   (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  51   "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  52   [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
  53                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]
  54 >;
  55
  56 let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {
  57
  58 defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
  59
  60 } // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]
  61
  62 let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
  63     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
  64
  65 defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
  66
  67 } // End OtherPredicates = [has32BankLDS, isNotGFX90APlus],
  68   //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
  69
  70 let OtherPredicates = [isNotGFX90APlus] in {
  71 let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
  72
  73 defm V_INTERP_P2_F32 : VINTRP_m <
  74   0x00000001,
  75   (outs VINTRPDst:$vdst),
  76   (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr,
  77        InterpAttrChan:$attrchan),
  78   "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  79   [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
  80                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
  81
  82 } // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
  83
  84 defm V_INTERP_MOV_F32 : VINTRP_m <
  85   0x00000002,
  86   (outs VINTRPDst:$vdst),
  87   (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  88   "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  89   [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
  90                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
  91
  92 } // End OtherPredicates = [isNotGFX90APlus]
  93
  94 } // End Uses = [MODE, M0, EXEC]
  95
  96 //===----------------------------------------------------------------------===//
  97 // Pseudo Instructions
  98 //===----------------------------------------------------------------------===//
  99
 100 // Insert a branch to an endpgm block to use as a fallback trap.
 101 def ENDPGM_TRAP : SPseudoInstSI<
 102   (outs), (ins),
 103   [(AMDGPUendpgm_trap)],
 104   "ENDPGM_TRAP"> {
 105   let hasSideEffects = 1;
 106   let usesCustomInserter = 1;
 107 }
 108
 109 def ATOMIC_FENCE : SPseudoInstSI<
 110   (outs), (ins i32imm:$ordering, i32imm:$scope),
 111   [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
 112   "ATOMIC_FENCE $ordering, $scope"> {
 113   let hasSideEffects = 1;
 114   let maybeAtomic = 1;
 115 }
 116
 117 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 118
 119 // For use in patterns
 120 def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 121   (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
 122   let isPseudo = 1;
 123   let isCodeGenOnly = 1;
 124   let usesCustomInserter = 1;
 125 }
 126
 127 // 64-bit vector move instruction. This is mainly used by the
 128 // SIFoldOperands pass to enable folding of inline immediates.
 129 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
 130                                       (ins VSrc_b64:$src0)> {
 131   let isReMaterializable = 1;
 132   let isAsCheapAsAMove = 1;
 133   let isMoveImm = 1;
 134   let SchedRW = [Write64Bit];
 135   let Size = 16; // Needs maximum 2 v_mov_b32 instructions 8 byte long each.
 136   let UseNamedOperandTable = 1;
 137 }
 138
 139 // 64-bit vector move with dpp. Expanded post-RA.
 140 def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
 141   let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
 142 }
 143
 144 // 64-bit scalar move immediate instruction. This is used to avoid subregs
 145 // initialization and allow rematerialization.
 146 def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
 147                                           (ins i64imm:$src0)> {
 148   let isReMaterializable = 1;
 149   let isAsCheapAsAMove = 1;
 150   let isMoveImm = 1;
 151   let SchedRW = [WriteSALU, Write64Bit];
 152   let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
 153   let Uses = [];
 154 }
 155
 156 // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
 157 // WQM pass processes it.
 158 def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 159
 160 // Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
 161 // turned into a copy by WQM pass, but does not seed WQM requirements.
 162 def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 163
 164 // Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
 165 // that the @earlyclobber is respected. The @earlyclobber is to make sure that
 166 // the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
 167 // accidentally clobber inactive channels of $vdst.
 168 let Constraints = "@earlyclobber $vdst" in {
 169 def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 170 def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 171 }
 172
 173 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 174
 175 def WWM_COPY : SPseudoInstSI <
 176   (outs unknown:$dst), (ins unknown:$src)> {
 177   let hasSideEffects = 0;
 178   let isAsCheapAsAMove = 1;
 179   let isConvergent = 1;
 180 }
 181
 182 def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
 183   let Uses = [EXEC];
 184   let Defs = [EXEC, SCC];
 185   let hasSideEffects = 0;
 186   let mayLoad = 0;
 187   let mayStore = 0;
 188 }
 189
 190 def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
 191   let hasSideEffects = 0;
 192   let mayLoad = 0;
 193   let mayStore = 0;
 194 }
 195
 196 def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
 197   let Uses = [EXEC];
 198   let Defs = [EXEC, SCC];
 199   let hasSideEffects = 0;
 200   let mayLoad = 0;
 201   let mayStore = 0;
 202 }
 203
 204 def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
 205   let hasSideEffects = 0;
 206   let mayLoad = 0;
 207   let mayStore = 0;
 208 }
 209
 210 let usesCustomInserter = 1 in {
 211 def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;
 212
 213 def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
 214 } // End usesCustomInserter = 1
 215
 216 // PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
 217 def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
 218   let Uses = [EXEC];
 219   let Defs = [EXEC];
 220   let hasSideEffects = 0;
 221   let mayLoad = 0;
 222   let mayStore = 0;
 223 }
 224
 225 def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
 226   let hasSideEffects = 0;
 227   let mayLoad = 0;
 228   let mayStore = 0;
 229 }
 230
 231 // Pseudo instructions used for @llvm.fptrunc.round upward
 232 // and @llvm.fptrunc.round downward.
 233 // These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
 234 // and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
 235 // FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
 236 // The final codegen is done in the ModeRegister pass.
 237 let Uses = [MODE, EXEC] in {
 238 def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
 239   (ins VGPR_32:$src0),
 240   [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;
 241
 242 def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
 243   (ins VGPR_32:$src0),
 244   [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
 245 } // End Uses = [MODE, EXEC]
 246
 247 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 248 // restoring it after we're done.
 249 let Defs = [SCC], isConvergent = 1 in {
 250 def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
 251   (ins VSrc_b32: $src, VSrc_b32:$inactive),
 252   [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
 253 }
 254
 255 def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
 256   (ins VSrc_b64: $src, VSrc_b64:$inactive),
 257   [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
 258 }
 259 } // End Defs = [SCC]
 260
 261 let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 262   def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
 263     (ins VSrc_b32: $src, VSrc_b32:$strategy),
 264     [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
 265   }
 266
 267   def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
 268     (ins VSrc_b32: $src, VSrc_b32:$strategy),
 269     [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
 270   }
 271 }
 272
 273 let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
 274 def V_ADD_U64_PSEUDO : VPseudoInstSI <
 275   (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
 276   [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
 277 >;
 278
 279 def V_SUB_U64_PSEUDO : VPseudoInstSI <
 280   (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
 281   [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
 282 >;
 283 } // End usesCustomInserter = 1, Defs = [VCC, EXEC]
 284
 285 let usesCustomInserter = 1, Defs = [SCC] in {
 286 def S_ADD_U64_PSEUDO : SPseudoInstSI <
 287   (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
 288   [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
 289 >;
 290
 291 def S_SUB_U64_PSEUDO : SPseudoInstSI <
 292   (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
 293   [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
 294 >;
 295
 296 def S_ADD_CO_PSEUDO : SPseudoInstSI <
 297   (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
 298 >;
 299
 300 def S_SUB_CO_PSEUDO : SPseudoInstSI <
 301   (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
 302 >;
 303
 304 def S_UADDO_PSEUDO : SPseudoInstSI <
 305   (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
 306 >;
 307
 308 def S_USUBO_PSEUDO : SPseudoInstSI <
 309   (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
 310 >;
 311
 312 } // End usesCustomInserter = 1, Defs = [SCC]
 313
 314 let usesCustomInserter = 1 in {
 315 def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
 316   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
 317 } // End let usesCustomInserter = 1, SALU = 1
 318
 319 // Wrap an instruction by duplicating it, except for setting isTerminator.
 320 class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
 321       base_inst.OutOperandList,
 322       base_inst.InOperandList> {
 323   let Uses = base_inst.Uses;
 324   let Defs = base_inst.Defs;
 325   let isTerminator = 1;
 326   let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
 327   let hasSideEffects = base_inst.hasSideEffects;
 328   let UseNamedOperandTable = base_inst.UseNamedOperandTable;
 329   let CodeSize = base_inst.CodeSize;
 330   let SchedRW = base_inst.SchedRW;
 331 }
 332
 333 let WaveSizePredicate = isWave64 in {
 334 def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 335 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 336 def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 337 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
 338 def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 339 def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
 340 }
 341
 342 let WaveSizePredicate = isWave32 in {
 343 def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 344 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 345 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 346 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
 347 def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 348 def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
 349 }
 350
 351
 352 def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
 353   [(int_amdgcn_wave_barrier)]> {
 354   let SchedRW = [];
 355   let hasNoSchedulingInfo = 1;
 356   let hasSideEffects = 1;
 357   let mayLoad = 0;
 358   let mayStore = 0;
 359   let isConvergent = 1;
 360   let FixedSize = 1;
 361   let Size = 0;
 362   let isMeta = 1;
 363 }
 364
 365 def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
 366   [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
 367   let SchedRW = [];
 368   let hasNoSchedulingInfo = 1;
 369   let hasSideEffects = 1;
 370   let mayLoad = 0;
 371   let mayStore = 0;
 372   let isConvergent = 1;
 373   let FixedSize = 1;
 374   let Size = 0;
 375   let isMeta = 1;
 376 }
 377
 378 def SCHED_GROUP_BARRIER : SPseudoInstSI<
 379   (outs),
 380   (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
 381   [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
 382   let SchedRW = [];
 383   let hasNoSchedulingInfo = 1;
 384   let hasSideEffects = 1;
 385   let mayLoad = 0;
 386   let mayStore = 0;
 387   let isConvergent = 1;
 388   let FixedSize = 1;
 389   let Size = 0;
 390   let isMeta = 1;
 391 }
 392
 393 def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
 394   [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
 395   let SchedRW = [];
 396   let hasNoSchedulingInfo = 1;
 397   let hasSideEffects = 1;
 398   let mayLoad = 0;
 399   let mayStore = 0;
 400   let isConvergent = 1;
 401   let FixedSize = 1;
 402   let Size = 0;
 403   let isMeta = 1;
 404 }
 405
 406 // SI pseudo instructions. These are used by the CFG structurizer pass
 407 // and should be lowered to ISA instructions prior to codegen.
 408
 409 // As we have enhanced control flow intrinsics to work under unstructured CFG,
 410 // duplicating such intrinsics can be actually treated as legal. On the contrary,
 411 // by making them non-duplicable, we are observing better code generation result.
 412 // So we choose to mark them non-duplicable in hope of getting better code
 413 // generation as well as simplied CFG during Machine IR optimization stage.
 414
 415 let isTerminator = 1, isNotDuplicable = 1 in {
 416
 417 let OtherPredicates = [EnableLateCFGStructurize] in {
 418  def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
 419   (outs),
 420   (ins SReg_1:$vcc, brtarget:$target),
 421   [(brcond i1:$vcc, bb:$target)]> {
 422     let Size = 12;
 423 }
 424 }
 425
 426 def SI_IF: CFPseudoInstSI <
 427   (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
 428   [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
 429   let Constraints = "";
 430   let Size = 12;
 431   let hasSideEffects = 1;
 432   let IsNeverUniform = 1;
 433 }
 434
 435 def SI_ELSE : CFPseudoInstSI <
 436   (outs SReg_1:$dst),
 437   (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
 438   let Size = 12;
 439   let hasSideEffects = 1;
 440   let IsNeverUniform = 1;
 441 }
 442
 443 def SI_WATERFALL_LOOP : CFPseudoInstSI <
 444   (outs),
 445   (ins brtarget:$target), [], 1> {
 446   let Size = 8;
 447   let isBranch = 1;
 448   let Defs = [];
 449 }
 450
 451 def SI_LOOP : CFPseudoInstSI <
 452   (outs), (ins SReg_1:$saved, brtarget:$target),
 453   [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
 454   let Size = 8;
 455   let isBranch = 1;
 456   let hasSideEffects = 1;
 457   let IsNeverUniform = 1;
 458 }
 459
 460 } // End isTerminator = 1
 461
 462 def SI_END_CF : CFPseudoInstSI <
 463   (outs), (ins SReg_1:$saved), [], 1, 1> {
 464   let Size = 4;
 465   let isAsCheapAsAMove = 1;
 466   let isReMaterializable = 1;
 467   let hasSideEffects = 1;
 468   let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
 469   let mayLoad = 1; // FIXME: Should not need memory flags
 470   let mayStore = 1;
 471 }
 472
 473 def SI_IF_BREAK : CFPseudoInstSI <
 474   (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
 475   let Size = 4;
 476   let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
 477   let isAsCheapAsAMove = 1;
 478   let isReMaterializable = 1;
 479 }
 480
 481 // Branch to the early termination block of the shader if SCC is 0.
 482 // This uses SCC from a previous SALU operation, i.e. the update of
 483 // a mask of live lanes after a kill/demote operation.
 484 // Only valid in pixel shaders.
 485 def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
 486   let Uses = [EXEC,SCC];
 487 }
 488
 489 let Uses = [EXEC] in {
 490
 491 multiclass PseudoInstKill <dag ins> {
 492   // Even though this pseudo can usually be expanded without an SCC def, we
 493   // conservatively assume that it has an SCC def, both because it is sometimes
 494   // required in degenerate cases (when V_CMPX cannot be used due to constant
 495   // bus limitations) and because it allows us to avoid having to track SCC
 496   // liveness across basic blocks.
 497   let Defs = [EXEC,SCC] in
 498   def _PSEUDO : PseudoInstSI <(outs), ins> {
 499     let isConvergent = 1;
 500     let usesCustomInserter = 1;
 501   }
 502
 503   let Defs = [EXEC,SCC] in
 504   def _TERMINATOR : SPseudoInstSI <(outs), ins> {
 505     let isTerminator = 1;
 506   }
 507 }
 508
 509 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 510 let Defs = [VCC] in
 511 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 512
 513 let Defs = [EXEC,VCC] in
 514 def SI_ILLEGAL_COPY : SPseudoInstSI <
 515   (outs unknown:$dst), (ins unknown:$src),
 516   [], " ; illegal copy $src to $dst">;
 517
 518 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 519
 520 // Branch on undef scc. Used to avoid intermediate copy from
 521 // IMPLICIT_DEF to SCC.
 522 def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> {
 523   let isTerminator = 1;
 524   let usesCustomInserter = 1;
 525   let isBranch = 1;
 526 }
 527
 528 def SI_PS_LIVE : PseudoInstSI <
 529   (outs SReg_1:$dst), (ins),
 530   [(set i1:$dst, (int_amdgcn_ps_live))]> {
 531   let SALU = 1;
 532 }
 533
 534 let Uses = [EXEC] in {
 535 def SI_LIVE_MASK : PseudoInstSI <
 536   (outs SReg_1:$dst), (ins),
 537   [(set i1:$dst, (int_amdgcn_live_mask))]> {
 538   let SALU = 1;
 539 }
 540 let Defs = [EXEC,SCC] in {
 541 // Demote: Turn a pixel shader thread into a helper lane.
 542 def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
 543 } // End Defs = [EXEC,SCC]
 544 } // End Uses = [EXEC]
 545
 546 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
 547   [(int_amdgcn_unreachable)],
 548   "; divergent unreachable"> {
 549   let Size = 0;
 550   let hasNoSchedulingInfo = 1;
 551   let FixedSize = 1;
 552   let isMeta = 1;
 553 }
 554
 555 // Used as an isel pseudo to directly emit initialization with an
 556 // s_mov_b32 rather than a copy of another initialized
 557 // register. MachineCSE skips copies, and we don't want to have to
 558 // fold operands before it runs.
 559 def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
 560   let Defs = [M0];
 561   let usesCustomInserter = 1;
 562   let isAsCheapAsAMove = 1;
 563   let isReMaterializable = 1;
 564 }
 565
 566 def SI_INIT_EXEC : SPseudoInstSI <
 567   (outs), (ins i64imm:$src),
 568   [(int_amdgcn_init_exec (i64 timm:$src))]> {
 569   let Defs = [EXEC];
 570   let isAsCheapAsAMove = 1;
 571 }
 572
 573 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
 574   (outs), (ins SSrc_b32:$input, i32imm:$shift),
 575   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
 576   let Defs = [EXEC];
 577 }
 578
 579 // Return for returning shaders to a shader variant epilog.
 580 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
 581   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
 582   let isTerminator = 1;
 583   let isBarrier = 1;
 584   let isReturn = 1;
 585   let hasNoSchedulingInfo = 1;
 586   let DisableWQM = 1;
 587   let FixedSize = 1;
 588
 589   // TODO: Should this be true?
 590   let isMeta = 0;
 591 }
 592
 593 // Return for returning function calls.
 594 def SI_RETURN : SPseudoInstSI <
 595   (outs), (ins), [(AMDGPUret_glue)],
 596   "; return"> {
 597   let isTerminator = 1;
 598   let isBarrier = 1;
 599   let isReturn = 1;
 600   let SchedRW = [WriteBranch];
 601 }
 602
 603 // Return for returning function calls without output register.
 604 //
 605 // This version is only needed so we can fill in the output register
 606 // in the custom inserter.
 607 def SI_CALL_ISEL : SPseudoInstSI <
 608   (outs), (ins SSrc_b64:$src0, unknown:$callee),
 609   [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
 610   let Size = 4;
 611   let isCall = 1;
 612   let SchedRW = [WriteBranch];
 613   let usesCustomInserter = 1;
 614   // TODO: Should really base this on the call target
 615   let isConvergent = 1;
 616 }
 617
 618 def : GCNPat<
 619   (AMDGPUcall i64:$src0, (i64 0)),
 620   (SI_CALL_ISEL $src0, (i64 0))
 621 >;
 622
 623 // Wrapper around s_swappc_b64 with extra $callee parameter to track
 624 // the called function after regalloc.
 625 def SI_CALL : SPseudoInstSI <
 626   (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
 627   let Size = 4;
 628   let FixedSize = 1;
 629   let isCall = 1;
 630   let UseNamedOperandTable = 1;
 631   let SchedRW = [WriteBranch];
 632   // TODO: Should really base this on the call target
 633   let isConvergent = 1;
 634 }
 635
 636 class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
 637   (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
 638   [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
 639   let Size = 4;
 640   let FixedSize = 1;
 641   let isCall = 1;
 642   let isTerminator = 1;
 643   let isReturn = 1;
 644   let isBarrier = 1;
 645   let UseNamedOperandTable = 1;
 646   let SchedRW = [WriteBranch];
 647   // TODO: Should really base this on the call target
 648   let isConvergent = 1;
 649 }
 650
 651 // Tail call handling pseudo
 652 def SI_TCRETURN :     SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
 653 def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
 654
 655 // Handle selecting indirect tail calls
 656 def : GCNPat<
 657   (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
 658   (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
 659 >;
 660
 661 // Handle selecting indirect tail calls for AMDGPU_gfx
 662 def : GCNPat<
 663   (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
 664   (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
 665 >;
 666
 667 def ADJCALLSTACKUP : SPseudoInstSI<
 668   (outs), (ins i32imm:$amt0, i32imm:$amt1),
 669   [(callseq_start timm:$amt0, timm:$amt1)],
 670   "; adjcallstackup $amt0 $amt1"> {
 671   let Size = 8; // Worst case. (s_add_u32 + constant)
 672   let FixedSize = 1;
 673   let hasSideEffects = 1;
 674   let usesCustomInserter = 1;
 675   let SchedRW = [WriteSALU];
 676   let Defs = [SCC];
 677 }
 678
 679 def ADJCALLSTACKDOWN : SPseudoInstSI<
 680   (outs), (ins i32imm:$amt1, i32imm:$amt2),
 681   [(callseq_end timm:$amt1, timm:$amt2)],
 682   "; adjcallstackdown $amt1"> {
 683   let Size = 8; // Worst case. (s_add_u32 + constant)
 684   let hasSideEffects = 1;
 685   let usesCustomInserter = 1;
 686   let SchedRW = [WriteSALU];
 687   let Defs = [SCC];
 688 }
 689
 690 let Defs = [M0, EXEC, SCC],
 691   UseNamedOperandTable = 1 in {
 692
 693 // SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect
 694 // addressing implementation.
 695 class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
 696   (outs VGPR_32:$vdst),
 697   (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
 698   let usesCustomInserter = 1;
 699 }
 700
 701 class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
 702   (outs rc:$vdst),
 703   (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
 704   let Constraints = "$src = $vdst";
 705   let usesCustomInserter = 1;
 706 }
 707
 708 def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
 709 def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
 710 def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
 711 def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
 712 def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
 713 def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
 714 def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
 715 def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
 716 def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
 717 def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
 718
 719 def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
 720 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 721 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 722 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
 723 def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
 724 def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
 725 def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
 726 def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
 727 def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 728 def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
 729
 730 } // End Uses = [EXEC], Defs = [M0, EXEC]
 731
 732 // This is a pseudo variant of the v_movreld_b32 instruction in which the
 733 // vector operand appears only twice, once as def and once as use. Using this
 734 // pseudo avoids problems with the Two Address instructions pass.
 735 class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
 736                                 RegisterOperand val_ty> : PseudoInstSI <
 737   (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
 738   let Constraints = "$vsrc = $vdst";
 739   let Uses = [M0];
 740 }
 741
 742 class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
 743   INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
 744   let VALU = 1;
 745   let VOP1 = 1;
 746   let Uses = [M0, EXEC];
 747 }
 748
 749 class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
 750                                   RegisterOperand val_ty> :
 751   INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
 752   let SALU = 1;
 753   let SOP1 = 1;
 754   let Uses = [M0];
 755 }
 756
 757 class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
 758   S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
 759 class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
 760   S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;
 761
 762 def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
 763 def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
 764 def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
 765 def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
 766 def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
 767 def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
 768 def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
 769 def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
 770 def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
 771 def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
 772 def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
 773 def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;
 774
 775 def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
 776 def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
 777 def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
 778 def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
 779 def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
 780 def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
 781 def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
 782 def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
 783 def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>;
 784 def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>;
 785 def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
 786 def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;
 787
 788 def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
 789 def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
 790 def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
 791 def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
 792 def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;
 793
 794 // These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
 795 // pseudos we avoid spills or copies being inserted within indirect sequences
 796 // that switch the VGPR indexing mode. Spills to accvgprs could be effected by
 797 // this mode switching.
 798
 799 class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
 800   (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
 801   let Constraints = "$vsrc = $vdst";
 802   let VALU = 1;
 803   let Uses = [M0, EXEC];
 804   let Defs = [M0];
 805 }
 806
 807 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
 808 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
 809 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
 810 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
 811 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
 812 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
 813 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
 814 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
 815 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
 816 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
 817 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
 818 def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;
 819
 820 class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
 821   (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
 822   let VALU = 1;
 823   let Uses = [M0, EXEC];
 824   let Defs = [M0];
 825 }
 826
 827 def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
 828 def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
 829 def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
 830 def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
 831 def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
 832 def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
 833 def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
 834 def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
 835 def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
 836 def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
 837 def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
 838 def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;
 839
 840 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
 841   let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
 842     def _SAVE : PseudoInstSI <
 843       (outs),
 844       (ins sgpr_class:$data, i32imm:$addr)> {
 845       let mayStore = 1;
 846       let mayLoad = 0;
 847     }
 848
 849     def _RESTORE : PseudoInstSI <
 850       (outs sgpr_class:$data),
 851       (ins i32imm:$addr)> {
 852       let mayStore = 0;
 853       let mayLoad = 1;
 854     }
 855   } // End UseNamedOperandTable = 1
 856 }
 857
 858 // You cannot use M0 as the output of v_readlane_b32 instructions or
 859 // use it in the sdata operand of SMEM instructions. We still need to
 860 // be able to spill the physical register m0, so allow it for
 861 // SI_SPILL_32_* instructions.
 862 defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
 863 defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
 864 defm SI_SPILL_S96  : SI_SPILL_SGPR <SReg_96>;
 865 defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
 866 defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
 867 defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
 868 defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
 869 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 870 defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
 871 defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
 872 defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
 873 defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
 874 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 875 defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
 876
 877 // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
 878 // needs to be used and an extra instruction to move between VGPR and AGPR.
 879 // UsesTmp adds to the total size of an expanded spill in this case.
 880 multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
 881   let UseNamedOperandTable = 1, VGPRSpill = 1,
 882        SchedRW = [WriteVMEM] in {
 883     def _SAVE : VPseudoInstSI <
 884       (outs),
 885       (ins vgpr_class:$vdata, i32imm:$vaddr,
 886            SReg_32:$soffset, i32imm:$offset)> {
 887       let mayStore = 1;
 888       let mayLoad = 0;
 889       // (2 * 4) + (8 * num_subregs) bytes maximum
 890       int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
 891       // Size field is unsigned char and cannot fit more.
 892       let Size = !if(!le(MaxSize, 256), MaxSize, 252);
 893     }
 894
 895     def _RESTORE : VPseudoInstSI <
 896       (outs vgpr_class:$vdata),
 897       (ins i32imm:$vaddr,
 898            SReg_32:$soffset, i32imm:$offset)> {
 899       let mayStore = 0;
 900       let mayLoad = 1;
 901
 902       // (2 * 4) + (8 * num_subregs) bytes maximum
 903       int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
 904       // Size field is unsigned char and cannot fit more.
 905       let Size = !if(!le(MaxSize, 256), MaxSize, 252);
 906     }
 907   } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
 908 }
 909
 910 defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
 911 defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
 912 defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
 913 defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
 914 defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
 915 defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
 916 defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
 917 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 918 defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
 919 defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
 920 defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
 921 defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
 922 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
 923 defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
 924
 925 defm SI_SPILL_A32  : SI_SPILL_VGPR <AGPR_32, 1>;
 926 defm SI_SPILL_A64  : SI_SPILL_VGPR <AReg_64, 1>;
 927 defm SI_SPILL_A96  : SI_SPILL_VGPR <AReg_96, 1>;
 928 defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
 929 defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
 930 defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
 931 defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
 932 defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
 933 defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
 934 defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
 935 defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
 936 defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
 937 defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
 938 defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
 939
 940 defm SI_SPILL_AV32  : SI_SPILL_VGPR <AV_32, 1>;
 941 defm SI_SPILL_AV64  : SI_SPILL_VGPR <AV_64, 1>;
 942 defm SI_SPILL_AV96  : SI_SPILL_VGPR <AV_96, 1>;
 943 defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
 944 defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
 945 defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
 946 defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
 947 defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
 948 defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
 949 defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
 950 defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
 951 defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
 952 defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
 953 defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
 954
 955 let isConvergent = 1 in
 956 defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
 957
 958 def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
 959   (outs SReg_64:$dst),
 960   (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
 961   [(set SReg_64:$dst,
 962       (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
 963   let Defs = [SCC];
 964 }
 965
 966 def : GCNPat <
 967   (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
 968   (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
 969 >;
 970
 971 def : GCNPat<
 972   (AMDGPUtrap timm:$trapid),
 973   (S_TRAP $trapid)
 974 >;
 975
 976 def : GCNPat<
 977   (AMDGPUelse i1:$src, bb:$target),
 978   (SI_ELSE $src, $target)
 979 >;
 980
 981 def : Pat <
 982   (int_amdgcn_kill i1:$src),
 983   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
 984 >;
 985
 986 def : Pat <
 987   (int_amdgcn_kill (i1 (not i1:$src))),
 988   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
 989 >;
 990
 991 def : Pat <
 992   (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
 993   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 994 >;
 995
 996 def : Pat <
 997   (int_amdgcn_wqm_demote i1:$src),
 998   (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
 999 >;
1000
1001 def : Pat <
1002   (int_amdgcn_wqm_demote (i1 (not i1:$src))),
1003   (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
1004 >;
1005
1006   // TODO: we could add more variants for other types of conditionals
1007
1008 def : Pat <
1009   (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
1010   (COPY $src) // Return the SGPRs representing i1 src
1011 >;
1012
1013 def : Pat <
1014   (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
1015   (COPY $src) // Return the SGPRs representing i1 src
1016 >;
1017
1018 //===----------------------------------------------------------------------===//
1019 // VOP1 Patterns
1020 //===----------------------------------------------------------------------===//
1021
1022 multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
1023   // f16_to_fp patterns
1024   def : GCNPat <
1025     (f32 (f16_to_fp i32:$src0)),
1026     (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
1027   >;
1028
1029   def : GCNPat <
1030     (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
1031     (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
1032   >;
1033
1034   def : GCNPat <
1035     (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
1036     (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
1037   >;
1038
1039   def : GCNPat <
1040     (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
1041     (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
1042   >;
1043
1044   def : GCNPat <
1045     (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
1046     (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
1047   >;
1048
1049   def : GCNPat <
1050     (f64 (fpextend f16:$src)),
1051     (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
1052   >;
1053
1054   // fp_to_fp16 patterns
1055   def : GCNPat <
1056     (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1057     (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
1058   >;
1059
1060   def : GCNPat <
1061     (i32 (fp_to_sint f16:$src)),
1062     (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1063   >;
1064
1065   def : GCNPat <
1066     (i32 (fp_to_uint f16:$src)),
1067     (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1068   >;
1069
1070   def : GCNPat <
1071     (f16 (sint_to_fp i32:$src)),
1072     (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
1073   >;
1074
1075   def : GCNPat <
1076     (f16 (uint_to_fp i32:$src)),
1077     (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
1078   >;
1079 }
1080
1081 let SubtargetPredicate = NotHasTrue16BitInsts in
1082 defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
1083
1084 let SubtargetPredicate = HasTrue16BitInsts in
1085 defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64>;
1086
1087 //===----------------------------------------------------------------------===//
1088 // VOP2 Patterns
1089 //===----------------------------------------------------------------------===//
1090
1091 // NoMods pattern used for mac. If there are any source modifiers then it's
1092 // better to select mad instead of mac.
1093 class FMADPat <ValueType vt, Instruction inst>
1094   : GCNPat <(vt (any_fmad (vt (VOP3NoMods vt:$src0)),
1095                           (vt (VOP3NoMods vt:$src1)),
1096                           (vt (VOP3NoMods vt:$src2)))),
1097     (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
1098           SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
1099 >;
1100
1101 // Prefer mac form when there are no modifiers.
1102 let AddedComplexity = 9 in {
1103 let OtherPredicates = [HasMadMacF32Insts] in
1104 def : FMADPat <f32, V_MAC_F32_e64>;
1105
1106 // Don't allow source modifiers. If there are any source modifiers then it's
1107 // better to select mad instead of mac.
1108 let SubtargetPredicate = isGFX6GFX7GFX10,
1109     OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
1110 def : GCNPat <
1111       (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
1112                                     (VOP3NoMods f32:$src1)),
1113                  (VOP3NoMods f32:$src2))),
1114       (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
1115                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
1116 >;
1117
1118 // Don't allow source modifiers. If there are any source modifiers then it's
1119 // better to select fma instead of fmac.
1120 let SubtargetPredicate = HasFmaLegacy32 in
1121 def : GCNPat <
1122       (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
1123                                   (VOP3NoMods f32:$src1),
1124                                   (VOP3NoMods f32:$src2))),
1125       (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
1126                              SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
1127 >;
1128
1129 let SubtargetPredicate = Has16BitInsts in
1130 def : FMADPat <f16, V_MAC_F16_e64>;
1131 } // AddedComplexity = 9
1132
1133 let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
1134 def : GCNPat <
1135       (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
1136                                     (VOP3Mods f32:$src1, i32:$src1_mod)),
1137                  (VOP3Mods f32:$src2, i32:$src2_mod))),
1138       (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
1139                         $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
1140 >;
1141
1142 class VOPSelectModsPat <ValueType vt> : GCNPat <
1143   (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods),
1144                         (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))),
1145   (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
1146                      FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
1147 >;
1148
1149 class VOPSelectPat <ValueType vt> : GCNPat <
1150   (vt (select i1:$src0, vt:$src1, vt:$src2)),
1151   (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
1152 >;
1153
1154 def : VOPSelectModsPat <i32>;
1155 def : VOPSelectModsPat <f32>;
1156 def : VOPSelectPat <f16>;
1157 def : VOPSelectPat <i16>;
1158
1159 let AddedComplexity = 1 in {
1160 def : GCNPat <
1161   (i32 (add (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), i32:$val)),
1162   (V_BCNT_U32_B32_e64 $popcnt, $val)
1163 >;
1164 }
1165
1166 def : GCNPat <
1167   (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
1168   (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
1169 >;
1170
1171 def : GCNPat <
1172   (i16 (add (i16 (trunc (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)))), i16:$val)),
1173   (V_BCNT_U32_B32_e64 $popcnt, $val)
1174 >;
1175
1176 def : GCNPat <
1177   (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
1178   (REG_SEQUENCE VReg_64,
1179     (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
1180       (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
1181       (i32 (V_MOV_B32_e32 (i32 0))), sub1)
1182 >;
1183
1184 /********** ============================================ **********/
1185 /********** Extraction, Insertion, Building and Casting  **********/
1186 /********** ============================================ **********/
1187
1188 // Special case for 2 element vectors. REQ_SEQUENCE produces better code
1189 // than an INSERT_SUBREG.
1190 multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
1191   def : GCNPat <
1192     (insertelt vec_type:$vec, elem_type:$elem, 0),
1193     (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
1194   >;
1195
1196   def : GCNPat <
1197     (insertelt vec_type:$vec, elem_type:$elem, 1),
1198     (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
1199   >;
1200 }
1201
1202 foreach Index = 0-1 in {
1203   def Extract_Element_v2i32_#Index : Extract_Element <
1204     i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
1205   >;
1206
1207   def Extract_Element_v2f32_#Index : Extract_Element <
1208     f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
1209   >;
1210 }
1211
1212 defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
1213 defm : Insert_Element_V2 <SReg_64, f32, v2f32>;
1214
1215 foreach Index = 0-2 in {
1216   def Extract_Element_v3i32_#Index : Extract_Element <
1217     i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
1218   >;
1219   def Insert_Element_v3i32_#Index : Insert_Element <
1220     i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
1221   >;
1222
1223   def Extract_Element_v3f32_#Index : Extract_Element <
1224     f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
1225   >;
1226   def Insert_Element_v3f32_#Index : Insert_Element <
1227     f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
1228   >;
1229 }
1230
1231 foreach Index = 0-3 in {
1232   def Extract_Element_v4i32_#Index : Extract_Element <
1233     i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
1234   >;
1235   def Insert_Element_v4i32_#Index : Insert_Element <
1236     i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
1237   >;
1238
1239   def Extract_Element_v4f32_#Index : Extract_Element <
1240     f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
1241   >;
1242   def Insert_Element_v4f32_#Index : Insert_Element <
1243     f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
1244   >;
1245 }
1246
1247 foreach Index = 0-4 in {
1248   def Extract_Element_v5i32_#Index : Extract_Element <
1249     i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
1250   >;
1251   def Insert_Element_v5i32_#Index : Insert_Element <
1252     i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
1253   >;
1254
1255   def Extract_Element_v5f32_#Index : Extract_Element <
1256     f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
1257   >;
1258   def Insert_Element_v5f32_#Index : Insert_Element <
1259     f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
1260   >;
1261 }
1262
1263 foreach Index = 0-5 in {
1264   def Extract_Element_v6i32_#Index : Extract_Element <
1265     i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
1266   >;
1267   def Insert_Element_v6i32_#Index : Insert_Element <
1268     i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
1269   >;
1270
1271   def Extract_Element_v6f32_#Index : Extract_Element <
1272     f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
1273   >;
1274   def Insert_Element_v6f32_#Index : Insert_Element <
1275     f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
1276   >;
1277 }
1278
1279 foreach Index = 0-6 in {
1280   def Extract_Element_v7i32_#Index : Extract_Element <
1281     i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
1282   >;
1283   def Insert_Element_v7i32_#Index : Insert_Element <
1284     i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
1285   >;
1286
1287   def Extract_Element_v7f32_#Index : Extract_Element <
1288     f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
1289   >;
1290   def Insert_Element_v7f32_#Index : Insert_Element <
1291     f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
1292   >;
1293 }
1294
1295 foreach Index = 0-7 in {
1296   def Extract_Element_v8i32_#Index : Extract_Element <
1297     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
1298   >;
1299   def Insert_Element_v8i32_#Index : Insert_Element <
1300     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
1301   >;
1302
1303   def Extract_Element_v8f32_#Index : Extract_Element <
1304     f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
1305   >;
1306   def Insert_Element_v8f32_#Index : Insert_Element <
1307     f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
1308   >;
1309 }
1310
1311 foreach Index = 0-8 in {
1312   def Extract_Element_v9i32_#Index : Extract_Element <
1313     i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
1314   >;
1315   def Insert_Element_v9i32_#Index : Insert_Element <
1316     i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
1317   >;
1318
1319   def Extract_Element_v9f32_#Index : Extract_Element <
1320     f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
1321   >;
1322   def Insert_Element_v9f32_#Index : Insert_Element <
1323     f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
1324   >;
1325 }
1326
1327 foreach Index = 0-9 in {
1328   def Extract_Element_v10i32_#Index : Extract_Element <
1329     i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
1330   >;
1331   def Insert_Element_v10i32_#Index : Insert_Element <
1332     i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
1333   >;
1334
1335   def Extract_Element_v10f32_#Index : Extract_Element <
1336     f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
1337   >;
1338   def Insert_Element_v10f32_#Index : Insert_Element <
1339     f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
1340   >;
1341 }
1342
1343 foreach Index = 0-10 in {
1344   def Extract_Element_v11i32_#Index : Extract_Element <
1345     i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
1346   >;
1347   def Insert_Element_v11i32_#Index : Insert_Element <
1348     i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
1349   >;
1350
1351   def Extract_Element_v11f32_#Index : Extract_Element <
1352     f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
1353   >;
1354   def Insert_Element_v11f32_#Index : Insert_Element <
1355     f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
1356   >;
1357 }
1358
1359 foreach Index = 0-11 in {
1360   def Extract_Element_v12i32_#Index : Extract_Element <
1361     i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
1362   >;
1363   def Insert_Element_v12i32_#Index : Insert_Element <
1364     i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
1365   >;
1366
1367   def Extract_Element_v12f32_#Index : Extract_Element <
1368     f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
1369   >;
1370   def Insert_Element_v12f32_#Index : Insert_Element <
1371     f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
1372   >;
1373 }
1374
1375 foreach Index = 0-15 in {
1376   def Extract_Element_v16i32_#Index : Extract_Element <
1377     i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
1378   >;
1379   def Insert_Element_v16i32_#Index : Insert_Element <
1380     i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
1381   >;
1382
1383   def Extract_Element_v16f32_#Index : Extract_Element <
1384     f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
1385   >;
1386   def Insert_Element_v16f32_#Index : Insert_Element <
1387     f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
1388   >;
1389 }
1390
1391
1392 foreach Index = 0-31 in {
1393   def Extract_Element_v32i32_#Index : Extract_Element <
1394     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
1395   >;
1396
1397   def Insert_Element_v32i32_#Index : Insert_Element <
1398     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
1399   >;
1400
1401   def Extract_Element_v32f32_#Index : Extract_Element <
1402     f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
1403   >;
1404
1405   def Insert_Element_v32f32_#Index : Insert_Element <
1406     f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
1407   >;
1408 }
1409
1410 // FIXME: Why do only some of these type combinations for SReg and
1411 // VReg?
1412 // 16-bit bitcast
1413 def : BitConvert <i16, f16, VGPR_32>;
1414 def : BitConvert <f16, i16, VGPR_32>;
1415 def : BitConvert <i16, f16, SReg_32>;
1416 def : BitConvert <f16, i16, SReg_32>;
1417
1418 // 32-bit bitcast
1419 def : BitConvert <i32, f32, VGPR_32>;
1420 def : BitConvert <f32, i32, VGPR_32>;
1421 def : BitConvert <i32, f32, SReg_32>;
1422 def : BitConvert <f32, i32, SReg_32>;
1423 def : BitConvert <v2i16, i32, SReg_32>;
1424 def : BitConvert <i32, v2i16, SReg_32>;
1425 def : BitConvert <v2f16, i32, SReg_32>;
1426 def : BitConvert <i32, v2f16, SReg_32>;
1427 def : BitConvert <v2i16, v2f16, SReg_32>;
1428 def : BitConvert <v2f16, v2i16, SReg_32>;
1429 def : BitConvert <v2f16, f32, SReg_32>;
1430 def : BitConvert <f32, v2f16, SReg_32>;
1431 def : BitConvert <v2i16, f32, SReg_32>;
1432 def : BitConvert <f32, v2i16, SReg_32>;
1433
1434 // 64-bit bitcast
1435 def : BitConvert <i64, f64, VReg_64>;
1436 def : BitConvert <f64, i64, VReg_64>;
1437 def : BitConvert <v2i32, v2f32, VReg_64>;
1438 def : BitConvert <v2f32, v2i32, VReg_64>;
1439 def : BitConvert <i64, v2i32, VReg_64>;
1440 def : BitConvert <v2i32, i64, VReg_64>;
1441 def : BitConvert <i64, v2f32, VReg_64>;
1442 def : BitConvert <v2f32, i64, VReg_64>;
1443 def : BitConvert <f64, v2f32, VReg_64>;
1444 def : BitConvert <v2f32, f64, VReg_64>;
1445 def : BitConvert <f64, v2i32, VReg_64>;
1446 def : BitConvert <v2i32, f64, VReg_64>;
1447 def : BitConvert <v4i16, v4f16, VReg_64>;
1448 def : BitConvert <v4f16, v4i16, VReg_64>;
1449
1450 // FIXME: Make SGPR
1451 def : BitConvert <v2i32, v4f16, VReg_64>;
1452 def : BitConvert <v4f16, v2i32, VReg_64>;
1453 def : BitConvert <v2i32, v4f16, VReg_64>;
1454 def : BitConvert <v2i32, v4i16, VReg_64>;
1455 def : BitConvert <v4i16, v2i32, VReg_64>;
1456 def : BitConvert <v2f32, v4f16, VReg_64>;
1457 def : BitConvert <v4f16, v2f32, VReg_64>;
1458 def : BitConvert <v2f32, v4i16, VReg_64>;
1459 def : BitConvert <v4i16, v2f32, VReg_64>;
1460 def : BitConvert <v4i16, f64, VReg_64>;
1461 def : BitConvert <v4f16, f64, VReg_64>;
1462 def : BitConvert <f64, v4i16, VReg_64>;
1463 def : BitConvert <f64, v4f16, VReg_64>;
1464 def : BitConvert <v4i16, i64, VReg_64>;
1465 def : BitConvert <v4f16, i64, VReg_64>;
1466 def : BitConvert <i64, v4i16, VReg_64>;
1467 def : BitConvert <i64, v4f16, VReg_64>;
1468
1469 def : BitConvert <v4i32, v4f32, VReg_128>;
1470 def : BitConvert <v4f32, v4i32, VReg_128>;
1471
1472 // 96-bit bitcast
1473 def : BitConvert <v3i32, v3f32, SGPR_96>;
1474 def : BitConvert <v3f32, v3i32, SGPR_96>;
1475
1476 // 128-bit bitcast
1477 def : BitConvert <v2i64, v4i32, SReg_128>;
1478 def : BitConvert <v4i32, v2i64, SReg_128>;
1479 def : BitConvert <v2f64, v4f32, VReg_128>;
1480 def : BitConvert <v2f64, v4i32, VReg_128>;
1481 def : BitConvert <v4f32, v2f64, VReg_128>;
1482 def : BitConvert <v4i32, v2f64, VReg_128>;
1483 def : BitConvert <v2i64, v2f64, VReg_128>;
1484 def : BitConvert <v2f64, v2i64, VReg_128>;
1485 def : BitConvert <v4f32, v2i64, VReg_128>;
1486 def : BitConvert <v2i64, v4f32, VReg_128>;
1487 def : BitConvert <v8i16, v4i32, SReg_128>;
1488 def : BitConvert <v4i32, v8i16, SReg_128>;
1489 def : BitConvert <v8f16, v4f32, VReg_128>;
1490 def : BitConvert <v8f16, v4i32, VReg_128>;
1491 def : BitConvert <v4f32, v8f16, VReg_128>;
1492 def : BitConvert <v4i32, v8f16, VReg_128>;
1493 def : BitConvert <v8i16, v8f16, VReg_128>;
1494 def : BitConvert <v8f16, v8i16, VReg_128>;
1495 def : BitConvert <v4f32, v8i16, VReg_128>;
1496 def : BitConvert <v8i16, v4f32, VReg_128>;
1497 def : BitConvert <v8i16, v8f16, SReg_128>;
1498 def : BitConvert <v8i16, v2i64, SReg_128>;
1499 def : BitConvert <v8i16, v2f64, SReg_128>;
1500 def : BitConvert <v8f16, v2i64, SReg_128>;
1501 def : BitConvert <v8f16, v2f64, SReg_128>;
1502 def : BitConvert <v8f16, v8i16, SReg_128>;
1503 def : BitConvert <v2i64, v8i16, SReg_128>;
1504 def : BitConvert <v2f64, v8i16, SReg_128>;
1505 def : BitConvert <v2i64, v8f16, SReg_128>;
1506 def : BitConvert <v2f64, v8f16, SReg_128>;
1507
1508 // 160-bit bitcast
1509 def : BitConvert <v5i32, v5f32, SReg_160>;
1510 def : BitConvert <v5f32, v5i32, SReg_160>;
1511 def : BitConvert <v5i32, v5f32, VReg_160>;
1512 def : BitConvert <v5f32, v5i32, VReg_160>;
1513
1514 // 192-bit bitcast
1515 def : BitConvert <v6i32, v6f32, SReg_192>;
1516 def : BitConvert <v6f32, v6i32, SReg_192>;
1517 def : BitConvert <v6i32, v6f32, VReg_192>;
1518 def : BitConvert <v6f32, v6i32, VReg_192>;
1519 def : BitConvert <v3i64, v3f64, VReg_192>;
1520 def : BitConvert <v3f64, v3i64, VReg_192>;
1521 def : BitConvert <v3i64, v6i32, VReg_192>;
1522 def : BitConvert <v3i64, v6f32, VReg_192>;
1523 def : BitConvert <v3f64, v6i32, VReg_192>;
1524 def : BitConvert <v3f64, v6f32, VReg_192>;
1525 def : BitConvert <v6i32, v3i64, VReg_192>;
1526 def : BitConvert <v6f32, v3i64, VReg_192>;
1527 def : BitConvert <v6i32, v3f64, VReg_192>;
1528 def : BitConvert <v6f32, v3f64, VReg_192>;
1529
1530 // 224-bit bitcast
1531 def : BitConvert <v7i32, v7f32, SReg_224>;
1532 def : BitConvert <v7f32, v7i32, SReg_224>;
1533 def : BitConvert <v7i32, v7f32, VReg_224>;
1534 def : BitConvert <v7f32, v7i32, VReg_224>;
1535
1536 // 256-bit bitcast
1537 def : BitConvert <v8i32, v8f32, SReg_256>;
1538 def : BitConvert <v8f32, v8i32, SReg_256>;
1539 def : BitConvert <v8i32, v8f32, VReg_256>;
1540 def : BitConvert <v8f32, v8i32, VReg_256>;
1541 def : BitConvert <v4i64, v4f64, VReg_256>;
1542 def : BitConvert <v4f64, v4i64, VReg_256>;
1543 def : BitConvert <v4i64, v8i32, VReg_256>;
1544 def : BitConvert <v4i64, v8f32, VReg_256>;
1545 def : BitConvert <v4f64, v8i32, VReg_256>;
1546 def : BitConvert <v4f64, v8f32, VReg_256>;
1547 def : BitConvert <v8i32, v4i64, VReg_256>;
1548 def : BitConvert <v8f32, v4i64, VReg_256>;
1549 def : BitConvert <v8i32, v4f64, VReg_256>;
1550 def : BitConvert <v8f32, v4f64, VReg_256>;
1551 def : BitConvert <v16i16, v16f16, SReg_256>;
1552 def : BitConvert <v16f16, v16i16, SReg_256>;
1553 def : BitConvert <v16i16, v16f16, VReg_256>;
1554 def : BitConvert <v16f16, v16i16, VReg_256>;
1555 def : BitConvert <v16f16, v8i32, VReg_256>;
1556 def : BitConvert <v16i16, v8i32, VReg_256>;
1557 def : BitConvert <v16f16, v8f32, VReg_256>;
1558 def : BitConvert <v16i16, v8f32, VReg_256>;
1559 def : BitConvert <v8i32, v16f16, VReg_256>;
1560 def : BitConvert <v8i32, v16i16, VReg_256>;
1561 def : BitConvert <v8f32, v16f16, VReg_256>;
1562 def : BitConvert <v8f32, v16i16, VReg_256>;
1563 def : BitConvert <v16f16, v4i64, VReg_256>;
1564 def : BitConvert <v16i16, v4i64, VReg_256>;
1565 def : BitConvert <v16f16, v4f64, VReg_256>;
1566 def : BitConvert <v16i16, v4f64, VReg_256>;
1567 def : BitConvert <v4i64, v16f16, VReg_256>;
1568 def : BitConvert <v4i64, v16i16, VReg_256>;
1569 def : BitConvert <v4f64, v16f16, VReg_256>;
1570 def : BitConvert <v4f64, v16i16, VReg_256>;
1571
1572 // 288-bit bitcast
1573 def : BitConvert <v9i32, v9f32, SReg_288>;
1574 def : BitConvert <v9f32, v9i32, SReg_288>;
1575 def : BitConvert <v9i32, v9f32, VReg_288>;
1576 def : BitConvert <v9f32, v9i32, VReg_288>;
1577
1578 // 320-bit bitcast
1579 def : BitConvert <v10i32, v10f32, SReg_320>;
1580 def : BitConvert <v10f32, v10i32, SReg_320>;
1581 def : BitConvert <v10i32, v10f32, VReg_320>;
1582 def : BitConvert <v10f32, v10i32, VReg_320>;
1583
1584 // 320-bit bitcast
1585 def : BitConvert <v11i32, v11f32, SReg_352>;
1586 def : BitConvert <v11f32, v11i32, SReg_352>;
1587 def : BitConvert <v11i32, v11f32, VReg_352>;
1588 def : BitConvert <v11f32, v11i32, VReg_352>;
1589
1590 // 384-bit bitcast
1591 def : BitConvert <v12i32, v12f32, SReg_384>;
1592 def : BitConvert <v12f32, v12i32, SReg_384>;
1593 def : BitConvert <v12i32, v12f32, VReg_384>;
1594 def : BitConvert <v12f32, v12i32, VReg_384>;
1595
1596 // 512-bit bitcast
1597 def : BitConvert <v16i32, v16f32, VReg_512>;
1598 def : BitConvert <v16f32, v16i32, VReg_512>;
1599 def : BitConvert <v8i64,  v8f64,  VReg_512>;
1600 def : BitConvert <v8f64,  v8i64,  VReg_512>;
1601 def : BitConvert <v8i64,  v16i32, VReg_512>;
1602 def : BitConvert <v8f64,  v16i32, VReg_512>;
1603 def : BitConvert <v16i32, v8i64,  VReg_512>;
1604 def : BitConvert <v16i32, v8f64,  VReg_512>;
1605 def : BitConvert <v8i64,  v16f32, VReg_512>;
1606 def : BitConvert <v8f64,  v16f32, VReg_512>;
1607 def : BitConvert <v16f32, v8i64,  VReg_512>;
1608 def : BitConvert <v16f32, v8f64,  VReg_512>;
1609
1610 // 1024-bit bitcast
1611 def : BitConvert <v32i32, v32f32, VReg_1024>;
1612 def : BitConvert <v32f32, v32i32, VReg_1024>;
1613 def : BitConvert <v16i64, v16f64, VReg_1024>;
1614 def : BitConvert <v16f64, v16i64, VReg_1024>;
1615 def : BitConvert <v16i64, v32i32, VReg_1024>;
1616 def : BitConvert <v32i32, v16i64, VReg_1024>;
1617 def : BitConvert <v16f64, v32f32, VReg_1024>;
1618 def : BitConvert <v32f32, v16f64, VReg_1024>;
1619 def : BitConvert <v16i64, v32f32, VReg_1024>;
1620 def : BitConvert <v32i32, v16f64, VReg_1024>;
1621 def : BitConvert <v16f64, v32i32, VReg_1024>;
1622 def : BitConvert <v32f32, v16i64, VReg_1024>;
1623
1624
1625 /********** =================== **********/
1626 /********** Src & Dst modifiers **********/
1627 /********** =================== **********/
1628
1629
1630 // If denormals are not enabled, it only impacts the compare of the
1631 // inputs. The output result is not flushed.
1632 class ClampPat<Instruction inst, ValueType vt> : GCNPat <
1633   (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
1634   (inst i32:$src0_modifiers, vt:$src0,
1635         i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
1636 >;
1637
1638 def : ClampPat<V_MAX_F32_e64, f32>;
1639 def : ClampPat<V_MAX_F64_e64, f64>;
1640 let SubtargetPredicate = NotHasTrue16BitInsts in
1641 def : ClampPat<V_MAX_F16_e64, f16>;
1642 let SubtargetPredicate = HasTrue16BitInsts in
1643 def : ClampPat<V_MAX_F16_t16_e64, f16>;
1644
1645 let SubtargetPredicate = HasVOP3PInsts in {
1646 def : GCNPat <
1647   (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
1648   (V_PK_MAX_F16 $src0_modifiers, $src0,
1649                 $src0_modifiers, $src0, DSTCLAMP.ENABLE)
1650 >;
1651 }
1652
1653
1654 /********** ================================ **********/
1655 /********** Floating point absolute/negative **********/
1656 /********** ================================ **********/
1657
1658 def : GCNPat <
1659   (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))),
1660   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
1661 >;
1662
1663 def : GCNPat <
1664   (UniformUnaryFrag<fabs> (f32 SReg_32:$src)),
1665   (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
1666 >;
1667
1668 def : GCNPat <
1669   (UniformUnaryFrag<fneg> (f32 SReg_32:$src)),
1670   (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
1671 >;
1672
1673 def : GCNPat <
1674   (UniformUnaryFrag<fneg> (f16 SReg_32:$src)),
1675   (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
1676 >;
1677
1678 def : GCNPat <
1679   (UniformUnaryFrag<fabs> (f16 SReg_32:$src)),
1680   (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
1681 >;
1682
1683 def : GCNPat <
1684   (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))),
1685   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
1686 >;
1687
1688 def : GCNPat <
1689   (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
1690   (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
1691 >;
1692
1693 def : GCNPat <
1694   (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)),
1695   (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
1696 >;
1697
1698 // This is really (fneg (fabs v2f16:$src))
1699 //
1700 // fabs is not reported as free because there is modifier for it in
1701 // VOP3P instructions, so it is turned into the bit op.
1702 def : GCNPat <
1703   (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
1704   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
1705 >;
1706
1707 def : GCNPat <
1708   (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))),
1709   (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
1710 >;
1711
1712
1713 // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
1714 // of the real value.
1715 def : GCNPat <
1716   (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)),
1717   (v2f32 (REG_SEQUENCE SReg_64,
1718          (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
1719                                            (i32 (S_MOV_B32 (i32 0x80000000)))),
1720                                  SReg_32)), sub0,
1721          (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
1722                                            (i32 (S_MOV_B32 (i32 0x80000000)))),
1723                                  SReg_32)), sub1))
1724 >;
1725
1726 def : GCNPat <
1727   (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)),
1728   (v2f32 (REG_SEQUENCE SReg_64,
1729          (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
1730                                            (i32 (S_MOV_B32 (i32 0x7fffffff)))),
1731                                  SReg_32)), sub0,
1732          (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
1733                                            (i32 (S_MOV_B32 (i32 0x7fffffff)))),
1734                                  SReg_32)), sub1))
1735 >;
1736
1737 def : GCNPat <
1738   (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))),
1739   (v2f32 (REG_SEQUENCE SReg_64,
1740          (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
1741                                            (i32 (S_MOV_B32 (i32 0x80000000)))),
1742                                  SReg_32)), sub0,
1743          (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
1744                                            (i32 (S_MOV_B32 (i32 0x80000000)))),
1745                                  SReg_32)), sub1))
1746 >;
1747
1748 // FIXME: Use S_BITSET0_B32/B64?
1749 def : GCNPat <
1750   (UniformUnaryFrag<fabs> (f64 SReg_64:$src)),
1751   (REG_SEQUENCE SReg_64,
1752     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
1753     sub0,
1754     (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
1755                    (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit.
1756      sub1)
1757 >;
1758
1759 def : GCNPat <
1760   (UniformUnaryFrag<fneg> (f64 SReg_64:$src)),
1761   (REG_SEQUENCE SReg_64,
1762     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
1763     sub0,
1764     (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
1765                    (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)),
1766     sub1)
1767 >;
1768
1769 def : GCNPat <
1770   (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))),
1771   (REG_SEQUENCE SReg_64,
1772     (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
1773     sub0,
1774     (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
1775                   (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit.
1776     sub1)
1777 >;
1778
1779
1780 def : GCNPat <
1781   (fneg (fabs (f32 VGPR_32:$src))),
1782   (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit
1783 >;
1784
1785 def : GCNPat <
1786   (fabs (f32 VGPR_32:$src)),
1787   (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
1788 >;
1789
1790 def : GCNPat <
1791   (fneg (f32 VGPR_32:$src)),
1792   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
1793 >;
1794
1795 def : GCNPat <
1796   (fabs (f16 VGPR_32:$src)),
1797   (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
1798 >;
1799
1800 def : GCNPat <
1801   (fneg (f16 VGPR_32:$src)),
1802   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
1803 >;
1804
1805 def : GCNPat <
1806   (fneg (fabs (f16 VGPR_32:$src))),
1807   (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
1808 >;
1809
1810 def : GCNPat <
1811   (fneg (v2f16 VGPR_32:$src)),
1812   (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
1813 >;
1814
1815 def : GCNPat <
1816   (fabs (v2f16 VGPR_32:$src)),
1817   (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
1818 >;
1819
1820 def : GCNPat <
1821   (fneg (v2f16 (fabs VGPR_32:$src))),
1822   (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
1823 >;
1824
1825 def : GCNPat <
1826   (fabs (f64 VReg_64:$src)),
1827   (REG_SEQUENCE VReg_64,
1828     (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
1829     sub0,
1830     (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))),
1831         (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
1832      sub1)
1833 >;
1834
1835 def : GCNPat <
1836   (fneg (f64 VReg_64:$src)),
1837   (REG_SEQUENCE VReg_64,
1838     (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
1839     sub0,
1840     (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
1841         (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
1842     sub1)
1843 >;
1844
1845 def : GCNPat <
1846   (fneg (fabs (f64 VReg_64:$src))),
1847   (REG_SEQUENCE VReg_64,
1848     (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
1849     sub0,
1850     (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
1851         (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
1852     sub1)
1853 >;
1854
1855 def : GCNPat <
1856   (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
1857   (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
1858                 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, 0,
1859                 0, 0, 0, 0, 0)
1860 > {
1861   let SubtargetPredicate = HasPackedFP32Ops;
1862 }
1863
1864 def : GCNPat <
1865   (fcopysign f16:$src0, f16:$src1),
1866   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
1867 >;
1868
1869 def : GCNPat <
1870   (fcopysign f32:$src0, f16:$src1),
1871   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
1872              (V_LSHLREV_B32_e64 (i32 16), $src1))
1873 >;
1874
1875 def : GCNPat <
1876   (fcopysign f64:$src0, f16:$src1),
1877   (REG_SEQUENCE SReg_64,
1878     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
1879     (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
1880                (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
1881 >;
1882
1883 def : GCNPat <
1884   (fcopysign f16:$src0, f32:$src1),
1885   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
1886              (V_LSHRREV_B32_e64 (i32 16), $src1))
1887 >;
1888
1889 def : GCNPat <
1890   (fcopysign f16:$src0, f64:$src1),
1891   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
1892              (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
1893 >;
1894
1895 /********** ================== **********/
1896 /********** Immediate Patterns **********/
1897 /********** ================== **********/
1898
1899 def : GCNPat <
1900   (VGPRImm<(i32 imm)>:$imm),
1901   (V_MOV_B32_e32 imm:$imm)
1902 >;
1903
1904 def : GCNPat <
1905   (VGPRImm<(f32 fpimm)>:$imm),
1906   (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
1907 >;
1908
1909 def : GCNPat <
1910   (i32 imm:$imm),
1911   (S_MOV_B32 imm:$imm)
1912 >;
1913
1914 def : GCNPat <
1915   (VGPRImm<(SIlds tglobaladdr:$ga)>),
1916   (V_MOV_B32_e32 $ga)
1917 >;
1918
1919 def : GCNPat <
1920   (SIlds tglobaladdr:$ga),
1921   (S_MOV_B32 $ga)
1922 >;
1923
1924 // FIXME: Workaround for ordering issue with peephole optimizer where
1925 // a register class copy interferes with immediate folding.  Should
1926 // use s_mov_b32, which can be shrunk to s_movk_i32
1927 def : GCNPat <
1928   (VGPRImm<(f16 fpimm)>:$imm),
1929   (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
1930 >;
1931
1932 def : GCNPat <
1933   (f32 fpimm:$imm),
1934   (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
1935 >;
1936
1937 def : GCNPat <
1938   (f16 fpimm:$imm),
1939   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
1940 >;
1941
1942 def : GCNPat <
1943   (p5 frameindex:$fi),
1944   (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
1945 >;
1946
1947 def : GCNPat <
1948   (p5 frameindex:$fi),
1949   (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
1950 >;
1951
1952 def : GCNPat <
1953   (i64 InlineImm64:$imm),
1954   (S_MOV_B64 InlineImm64:$imm)
1955 >;
1956
1957 // XXX - Should this use a s_cmp to set SCC?
1958
1959 // Set to sign-extended 64-bit value (true = -1, false = 0)
1960 def : GCNPat <
1961   (i1 imm:$imm),
1962   (S_MOV_B64 (i64 (as_i64imm $imm)))
1963 > {
1964   let WaveSizePredicate = isWave64;
1965 }
1966
1967 def : GCNPat <
1968   (i1 imm:$imm),
1969   (S_MOV_B32 (i32 (as_i32imm $imm)))
1970 > {
1971   let WaveSizePredicate = isWave32;
1972 }
1973
1974 def : GCNPat <
1975   (f64 InlineImmFP64:$imm),
1976   (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
1977 >;
1978
1979 /********** ================== **********/
1980 /********** Intrinsic Patterns **********/
1981 /********** ================== **********/
1982
1983 def : GCNPat <
1984   (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))),
1985   (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0))
1986 >;
1987
1988 def : GCNPat <
1989   (i32 (sext i1:$src0)),
1990   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
1991                      /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0)
1992 >;
1993
1994 class Ext32Pat <SDNode ext> : GCNPat <
1995   (i32 (ext i1:$src0)),
1996   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
1997                      /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
1998 >;
1999
2000 def : Ext32Pat <zext>;
2001 def : Ext32Pat <anyext>;
2002
2003 // The multiplication scales from [0,1) to the unsigned integer range,
2004 // rounding down a bit to avoid unwanted overflow.
2005 def : GCNPat <
2006   (AMDGPUurecip i32:$src0),
2007   (V_CVT_U32_F32_e32
2008     (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
2009                    (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
2010 >;
2011
2012 //===----------------------------------------------------------------------===//
2013 // VOP3 Patterns
2014 //===----------------------------------------------------------------------===//
2015
2016 def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
2017 def : UMad24Pat<V_MAD_U32_U24_e64, 1>;
2018
2019 // BFI patterns
2020
2021 def BFIImm32 : PatFrag<
2022   (ops node:$x, node:$y, node:$z),
2023   (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
2024   [{
2025     auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
2026     auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
2027     return X && NotX &&
2028       ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
2029   }]
2030 >;
2031
2032
2033 // Definition from ISA doc:
2034 // (y & x) | (z & ~x)
2035 def : AMDGPUPatIgnoreCopies <
2036   (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
2037   (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
2038                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
2039                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
2040 >;
2041
2042 // (y & C) | (z & ~C)
2043 def : AMDGPUPatIgnoreCopies <
2044   (BFIImm32 i32:$x, i32:$y, i32:$z),
2045   (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
2046 >;
2047
2048 // 64-bit version
2049 def : AMDGPUPatIgnoreCopies <
2050   (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
2051   (REG_SEQUENCE VReg_64,
2052     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
2053               (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
2054               (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
2055     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
2056               (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
2057               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
2058 >;
2059
2060 // SHA-256 Ch function
2061 // z ^ (x & (y ^ z))
2062 def : AMDGPUPatIgnoreCopies <
2063   (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
2064   (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
2065                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
2066                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
2067 >;
2068
2069 // 64-bit version
2070 def : AMDGPUPatIgnoreCopies <
2071   (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
2072   (REG_SEQUENCE VReg_64,
2073     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
2074               (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
2075               (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
2076     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
2077               (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
2078               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
2079 >;
2080
2081 def : AMDGPUPat <
2082   (fcopysign f32:$src0, f32:$src1),
2083   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
2084 >;
2085
2086 def : AMDGPUPat <
2087   (fcopysign f32:$src0, f64:$src1),
2088   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
2089              (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
2090 >;
2091
2092 def : AMDGPUPat <
2093   (fcopysign f64:$src0, f64:$src1),
2094   (REG_SEQUENCE SReg_64,
2095     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
2096     (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
2097                (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
2098                (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
2099 >;
2100
2101 def : AMDGPUPat <
2102   (fcopysign f64:$src0, f32:$src1),
2103   (REG_SEQUENCE SReg_64,
2104     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
2105     (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
2106                (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
2107                $src1), sub1)
2108 >;
2109
2110 def : ROTRPattern <V_ALIGNBIT_B32_e64>;
2111
2112 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
2113           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2114                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
2115
2116 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2117           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2118                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
2119
2120 /********** ====================== **********/
2121 /**********   Indirect addressing  **********/
2122 /********** ====================== **********/
2123
2124 multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
2125   // Extract with offset
2126   def : GCNPat<
2127     (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
2128     (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
2129   >;
2130
2131   // Insert with offset
2132   def : GCNPat<
2133     (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
2134     (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
2135   >;
2136 }
2137
2138 defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
2139 defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
2140 defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
2141 defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">;
2142 defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">;
2143 defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">;
2144 defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">;
2145 defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
2146 defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
2147
2148 defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
2149 defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
2150 defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
2151 defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">;
2152 defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">;
2153 defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">;
2154 defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">;
2155 defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
2156 defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
2157
2158 //===----------------------------------------------------------------------===//
2159 // SAD Patterns
2160 //===----------------------------------------------------------------------===//
2161
2162 def : GCNPat <
2163   (add (sub_oneuse (umax i32:$src0, i32:$src1),
2164                    (umin i32:$src0, i32:$src1)),
2165        i32:$src2),
2166   (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
2167 >;
2168
2169 def : GCNPat <
2170   (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
2171                       (sub i32:$src0, i32:$src1),
2172                       (sub i32:$src1, i32:$src0)),
2173        i32:$src2),
2174   (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0))
2175 >;
2176
2177 //===----------------------------------------------------------------------===//
2178 // Conversion Patterns
2179 //===----------------------------------------------------------------------===//
2180 def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
2181   (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
2182
2183 // Handle sext_inreg in i64
2184 def : GCNPat <
2185   (i64 (UniformSextInreg<i1> i64:$src)),
2186   (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
2187 >;
2188
2189 def : GCNPat <
2190   (i16 (UniformSextInreg<i1> i16:$src)),
2191   (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
2192 >;
2193
2194 def : GCNPat <
2195   (i16 (UniformSextInreg<i8> i16:$src)),
2196   (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
2197 >;
2198
2199 def : GCNPat <
2200   (i64 (UniformSextInreg<i8> i64:$src)),
2201   (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
2202 >;
2203
2204 def : GCNPat <
2205   (i64 (UniformSextInreg<i16> i64:$src)),
2206   (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
2207 >;
2208
2209 def : GCNPat <
2210   (i64 (UniformSextInreg<i32> i64:$src)),
2211   (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
2212 >;
2213
2214 def : GCNPat<
2215   (i32 (DivergentSextInreg<i1> i32:$src)),
2216   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
2217
2218 def : GCNPat <
2219   (i16 (DivergentSextInreg<i1> i16:$src)),
2220   (V_BFE_I32_e64 $src, (i32 0), (i32 1))
2221 >;
2222
2223 def : GCNPat <
2224   (i16 (DivergentSextInreg<i8> i16:$src)),
2225   (V_BFE_I32_e64 $src, (i32 0), (i32 8))
2226 >;
2227
2228 def : GCNPat<
2229   (i32 (DivergentSextInreg<i8> i32:$src)),
2230   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8))
2231 >;
2232
2233 def : GCNPat <
2234   (i32 (DivergentSextInreg<i16> i32:$src)),
2235   (V_BFE_I32_e64 $src, (i32 0), (i32 16))
2236 >;
2237
2238 def : GCNPat <
2239   (i64 (DivergentSextInreg<i1> i64:$src)),
2240   (REG_SEQUENCE VReg_64,
2241     (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0,
2242     (V_ASHRREV_I32_e32  (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1)
2243 >;
2244
2245 def : GCNPat <
2246   (i64 (DivergentSextInreg<i8> i64:$src)),
2247   (REG_SEQUENCE VReg_64,
2248     (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0,
2249     (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
2250 >;
2251
2252 def : GCNPat <
2253   (i64 (DivergentSextInreg<i16> i64:$src)),
2254   (REG_SEQUENCE VReg_64,
2255     (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0,
2256     (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
2257 >;
2258
2259 def : GCNPat <
2260   (i64 (DivergentSextInreg<i32> i64:$src)),
2261   (REG_SEQUENCE VReg_64,
2262     (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
2263     (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1)
2264 >;
2265
2266 def : GCNPat <
2267   (i64 (zext i32:$src)),
2268   (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
2269 >;
2270
2271 def : GCNPat <
2272   (i64 (anyext i32:$src)),
2273   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
2274 >;
2275
2276 class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
2277   (i64 (ext i1:$src)),
2278     (REG_SEQUENCE VReg_64,
2279       (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2280                          /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
2281       sub0, (S_MOV_B32 (i32 0)), sub1)
2282 >;
2283
2284
2285 def : ZExt_i64_i1_Pat<zext>;
2286 def : ZExt_i64_i1_Pat<anyext>;
2287
2288 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
2289 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
2290 def : GCNPat <
2291   (i64 (UniformUnaryFrag<sext> i32:$src)),
2292     (REG_SEQUENCE SReg_64, $src, sub0,
2293     (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
2294 >;
2295
2296 def : GCNPat <
2297   (i64 (DivergentUnaryFrag<sext> i32:$src)),
2298     (REG_SEQUENCE VReg_64, $src, sub0,
2299     (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
2300 >;
2301
2302 def : GCNPat <
2303   (i64 (sext i1:$src)),
2304   (REG_SEQUENCE VReg_64,
2305     (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2306                        /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
2307     (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2308                        /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
2309 >;
2310
2311 class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
2312   (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
2313   (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
2314 >;
2315
2316 def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
2317 def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
2318 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
2319 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
2320 def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
2321 def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
2322
2323 // If we need to perform a logical operation on i1 values, we need to
2324 // use vector comparisons since there is only one SCC register. Vector
2325 // comparisons may write to a pair of SGPRs or a single SGPR, so treat
2326 // these as 32 or 64-bit comparisons. When legalizing SGPR copies,
2327 // instructions resulting in the copies from SCC to these instructions
2328 // will be moved to the VALU.
2329
2330 let WaveSizePredicate = isWave64 in {
2331 def : GCNPat <
2332   (i1 (and i1:$src0, i1:$src1)),
2333   (S_AND_B64 $src0, $src1)
2334 >;
2335
2336 def : GCNPat <
2337   (i1 (or i1:$src0, i1:$src1)),
2338   (S_OR_B64 $src0, $src1)
2339 >;
2340
2341 def : GCNPat <
2342   (i1 (xor i1:$src0, i1:$src1)),
2343   (S_XOR_B64 $src0, $src1)
2344 >;
2345
2346 def : GCNPat <
2347   (i1 (add i1:$src0, i1:$src1)),
2348   (S_XOR_B64 $src0, $src1)
2349 >;
2350
2351 def : GCNPat <
2352   (i1 (sub i1:$src0, i1:$src1)),
2353   (S_XOR_B64 $src0, $src1)
2354 >;
2355
2356 let AddedComplexity = 1 in {
2357 def : GCNPat <
2358   (i1 (add i1:$src0, (i1 -1))),
2359   (S_NOT_B64 $src0)
2360 >;
2361
2362 def : GCNPat <
2363   (i1 (sub i1:$src0, (i1 -1))),
2364   (S_NOT_B64 $src0)
2365 >;
2366 }
2367 } // end isWave64
2368
2369 let WaveSizePredicate = isWave32 in {
2370 def : GCNPat <
2371   (i1 (and i1:$src0, i1:$src1)),
2372   (S_AND_B32 $src0, $src1)
2373 >;
2374
2375 def : GCNPat <
2376   (i1 (or i1:$src0, i1:$src1)),
2377   (S_OR_B32 $src0, $src1)
2378 >;
2379
2380 def : GCNPat <
2381   (i1 (xor i1:$src0, i1:$src1)),
2382   (S_XOR_B32 $src0, $src1)
2383 >;
2384
2385 def : GCNPat <
2386   (i1 (add i1:$src0, i1:$src1)),
2387   (S_XOR_B32 $src0, $src1)
2388 >;
2389
2390 def : GCNPat <
2391   (i1 (sub i1:$src0, i1:$src1)),
2392   (S_XOR_B32 $src0, $src1)
2393 >;
2394
2395 let AddedComplexity = 1 in {
2396 def : GCNPat <
2397   (i1 (add i1:$src0, (i1 -1))),
2398   (S_NOT_B32 $src0)
2399 >;
2400
2401 def : GCNPat <
2402   (i1 (sub i1:$src0, (i1 -1))),
2403   (S_NOT_B32 $src0)
2404 >;
2405 }
2406 } // end isWave32
2407
2408 def : GCNPat <
2409   (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
2410   (V_NOT_B32_e32 $src0)
2411 >;
2412
2413 def : GCNPat <
2414   (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))),
2415     (REG_SEQUENCE VReg_64,
2416       (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0,
2417       (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1
2418     )
2419 >;
2420
2421 let SubtargetPredicate = NotHasTrue16BitInsts in
2422 def : GCNPat <
2423   (f16 (sint_to_fp i1:$src)),
2424   (V_CVT_F16_F32_e32 (
2425       V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2426                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2427                         SSrc_i1:$src))
2428 >;
2429
2430 let SubtargetPredicate = HasTrue16BitInsts in
2431 def : GCNPat <
2432   (f16 (sint_to_fp i1:$src)),
2433   (V_CVT_F16_F32_t16_e32 (
2434       V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2435                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2436                         SSrc_i1:$src))
2437 >;
2438
2439 let SubtargetPredicate = NotHasTrue16BitInsts in
2440 def : GCNPat <
2441   (f16 (uint_to_fp i1:$src)),
2442   (V_CVT_F16_F32_e32 (
2443       V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2444                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2445                         SSrc_i1:$src))
2446 >;
2447 let SubtargetPredicate = HasTrue16BitInsts in
2448 def : GCNPat <
2449   (f16 (uint_to_fp i1:$src)),
2450   (V_CVT_F16_F32_t16_e32 (
2451       V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2452                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2453                         SSrc_i1:$src))
2454 >;
2455
2456 def : GCNPat <
2457   (f32 (sint_to_fp i1:$src)),
2458   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2459                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2460                         SSrc_i1:$src)
2461 >;
2462
2463 def : GCNPat <
2464   (f32 (uint_to_fp i1:$src)),
2465   (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2466                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2467                         SSrc_i1:$src)
2468 >;
2469
2470 def : GCNPat <
2471   (f64 (sint_to_fp i1:$src)),
2472   (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2473                                         /*src1mod*/(i32 0), /*src1*/(i32 -1),
2474                                         SSrc_i1:$src))
2475 >;
2476
2477 def : GCNPat <
2478   (f64 (uint_to_fp i1:$src)),
2479   (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2480                                         /*src1mod*/(i32 0), /*src1*/(i32 1),
2481                                         SSrc_i1:$src))
2482 >;
2483
2484 //===----------------------------------------------------------------------===//
2485 // Miscellaneous Patterns
2486 //===----------------------------------------------------------------------===//
2487
2488 // Eliminate a zero extension from an fp16 operation if it already
2489 // zeros the high bits of the 32-bit register.
2490 //
2491 // This is complicated on gfx9+. Some instructions maintain the legacy
2492 // zeroing behavior, but others preserve the high bits. Some have a
2493 // control bit to change the behavior. We can't simply say with
2494 // certainty what the source behavior is without more context on how
2495 // the src is lowered. e.g. fptrunc + fma may be lowered to a
2496 // v_fma_mix* instruction which does not zero, or may not.
2497 def : GCNPat<
2498   (i32 (DivergentUnaryFrag<abs> i32:$src)),
2499   (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
2500
2501 let AddedComplexity = 1 in {
2502 def : GCNPat<
2503   (i32 (DivergentUnaryFrag<abs> i32:$src)),
2504   (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
2505   let SubtargetPredicate = HasAddNoCarryInsts;
2506 }
2507 }  // AddedComplexity = 1
2508
2509 def : GCNPat<
2510   (i32 (DivergentUnaryFrag<zext> i16:$src)),
2511   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
2512 >;
2513
2514 def : GCNPat<
2515   (i64 (DivergentUnaryFrag<zext> i16:$src)),
2516   (REG_SEQUENCE VReg_64,
2517     (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0,
2518     (S_MOV_B32 (i32 0)), sub1)
2519 >;
2520
2521 def : GCNPat<
2522   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
2523   (COPY VSrc_b16:$src)>;
2524
2525 def : GCNPat <
2526   (i32 (trunc i64:$a)),
2527   (EXTRACT_SUBREG $a, sub0)
2528 >;
2529
2530 def : GCNPat <
2531   (i1 (UniformUnaryFrag<trunc> i32:$a)),
2532   (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
2533 >;
2534
2535 def : GCNPat <
2536   (i1 (UniformUnaryFrag<trunc> i16:$a)),
2537   (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
2538 >;
2539
2540 def : GCNPat <
2541   (i1 (UniformUnaryFrag<trunc> i64:$a)),
2542   (S_CMP_EQ_U32 (S_AND_B32 (i32 1),
2543                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
2544 >;
2545
2546 def : GCNPat <
2547   (i1 (DivergentUnaryFrag<trunc> i32:$a)),
2548   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
2549 >;
2550
2551 def : GCNPat <
2552   (i1 (DivergentUnaryFrag<trunc> i16:$a)),
2553   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
2554 >;
2555
2556 def IMMBitSelConst : SDNodeXForm<imm, [{
2557   return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
2558                                    MVT::i32);
2559 }]>;
2560
2561 // Matching separate SRL and TRUNC instructions
2562 // with dependent operands (SRL dest is source of TRUNC)
2563 // generates three instructions. However, by using bit shifts,
2564 // the V_LSHRREV_B32_e64 result can be directly used in the
2565 // operand of the V_AND_B32_e64 instruction:
2566 // (trunc i32 (srl i32 $a, i32 $b)) ->
2567 // v_and_b32_e64 $a, (1 << $b), $a
2568 // v_cmp_ne_u32_e64 $a, 0, $a
2569
2570 // Handle the VALU case.
2571 def : GCNPat <
2572   (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
2573   (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
2574     (i32 0))
2575 >;
2576
2577 // Handle the scalar case.
2578 def : GCNPat <
2579   (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
2580   (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a),
2581     (i32 0))
2582 >;
2583
2584 def : GCNPat <
2585   (i1 (DivergentUnaryFrag<trunc> i64:$a)),
2586   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
2587                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
2588 >;
2589
2590 def : GCNPat <
2591   (i32 (bswap i32:$a)),
2592   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
2593              (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
2594              (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
2595 >;
2596
2597 // FIXME: This should have been narrowed to i32 during legalization.
2598 // This pattern should also be skipped for GlobalISel
2599 def : GCNPat <
2600   (i64 (bswap i64:$a)),
2601   (REG_SEQUENCE VReg_64,
2602   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
2603              (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
2604                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
2605                              (i32 24)),
2606              (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
2607                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
2608                              (i32 8))),
2609   sub0,
2610   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
2611              (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
2612                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
2613                              (i32 24)),
2614              (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
2615                              (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
2616                              (i32 8))),
2617   sub1)
2618 >;
2619
2620 // FIXME: The AddedComplexity should not be needed, but in GlobalISel
2621 // the BFI pattern ends up taking precedence without it.
2622 let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
2623 // Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
2624 //
2625 // My reading of the manual suggests we should be using src0 for the
2626 // register value, but this is what seems to work.
2627 def : GCNPat <
2628   (i32 (bswap i32:$a)),
2629   (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
2630 >;
2631
2632 // FIXME: This should have been narrowed to i32 during legalization.
2633 // This pattern should also be skipped for GlobalISel
2634 def : GCNPat <
2635   (i64 (bswap i64:$a)),
2636   (REG_SEQUENCE VReg_64,
2637   (V_PERM_B32_e64  (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
2638               (S_MOV_B32 (i32 0x00010203))),
2639   sub0,
2640   (V_PERM_B32_e64  (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
2641               (S_MOV_B32 (i32 0x00010203))),
2642   sub1)
2643 >;
2644
2645 // Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
2646 // The 12s emit 0s.
2647 def : GCNPat <
2648   (i16 (bswap i16:$a)),
2649   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
2650 >;
2651
2652 def : GCNPat <
2653   (i32 (zext (bswap i16:$a))),
2654   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
2655 >;
2656
2657 // Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
2658 def : GCNPat <
2659   (v2i16 (bswap v2i16:$a)),
2660   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
2661 >;
2662
2663 }
2664
2665 def : GCNPat<
2666   (i64 (DivergentUnaryFrag<bitreverse> i64:$a)),
2667   (REG_SEQUENCE VReg_64,
2668    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
2669    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
2670
2671 // Prefer selecting to max when legal, but using mul is always valid.
2672 let AddedComplexity = -5 in {
2673
2674 let OtherPredicates = [NotHasTrue16BitInsts] in {
2675 def : GCNPat<
2676   (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
2677   (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
2678 >;
2679
2680 def : GCNPat<
2681   (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
2682   (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
2683 >;
2684 } // End OtherPredicates
2685
2686 let OtherPredicates = [HasTrue16BitInsts] in {
2687 def : GCNPat<
2688   (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
2689   (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
2690 >;
2691
2692 def : GCNPat<
2693   (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
2694   (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
2695 >;
2696 } // End OtherPredicates
2697
2698 def : GCNPat<
2699   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
2700   (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
2701 >;
2702
2703 def : GCNPat<
2704   (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
2705   (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
2706 >;
2707
2708 def : GCNPat<
2709   (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
2710   (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
2711 >;
2712
2713 // TODO: Handle fneg like other types.
2714 def : GCNPat<
2715   (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
2716   (V_MUL_F64_e64  0, CONST.FP64_ONE, $src_mods, $src)
2717 >;
2718 } // End AddedComplexity = -5
2719
2720 multiclass SelectCanonicalizeAsMax<
2721   list<Predicate> f32_preds = [],
2722   list<Predicate> f64_preds = [],
2723   list<Predicate> f16_preds = []> {
2724   def : GCNPat<
2725     (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
2726     (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
2727     let OtherPredicates = f32_preds;
2728   }
2729
2730   def : GCNPat<
2731     (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
2732     (V_MAX_F64_e64  $src_mods, $src, $src_mods, $src)> {
2733     let OtherPredicates = f64_preds;
2734   }
2735
2736   def : GCNPat<
2737     (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
2738     (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
2739     let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]);
2740   }
2741
2742   def : GCNPat<
2743     (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
2744     (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
2745     let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]);
2746   }
2747
2748   def : GCNPat<
2749     (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
2750     (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
2751     // FIXME: Should have VOP3P subtarget predicate
2752     let OtherPredicates = f16_preds;
2753   }
2754 }
2755
2756 // On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
2757 // mode, and would never flush. For f64, it's faster to do implement
2758 // this with a max. For f16/f32 it's a wash, but prefer max when
2759 // valid.
2760 //
2761 // FIXME: Lowering f32/f16 with max is worse since we can use a
2762 // smaller encoding if the input is fneg'd. It also adds an extra
2763 // register use.
2764 let SubtargetPredicate = HasMinMaxDenormModes in {
2765   defm : SelectCanonicalizeAsMax<[], [], []>;
2766 } // End SubtargetPredicate = HasMinMaxDenormModes
2767
2768 let SubtargetPredicate = NotHasMinMaxDenormModes in {
2769   // Use the max lowering if we don't need to flush.
2770
2771   // FIXME: We don't do use this for f32 as a workaround for the
2772   // library being compiled with the default ieee mode, but
2773   // potentially being called from flushing kernels. Really we should
2774   // not be mixing code expecting different default FP modes, but mul
2775   // works in any FP environment.
2776   defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
2777 } // End SubtargetPredicate = NotHasMinMaxDenormModes
2778
2779
2780 let OtherPredicates = [HasDLInsts] in {
2781 // Don't allow source modifiers. If there are any source modifiers then it's
2782 // better to select fma instead of fmac.
2783 def : GCNPat <
2784   (fma (f32 (VOP3NoMods f32:$src0)),
2785        (f32 (VOP3NoMods f32:$src1)),
2786        (f32 (VOP3NoMods f32:$src2))),
2787   (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
2788                   SRCMODS.NONE, $src2)
2789 >;
2790 } // End OtherPredicates = [HasDLInsts]
2791
2792 let SubtargetPredicate = isGFX10Plus in {
2793 // Don't allow source modifiers. If there are any source modifiers then it's
2794 // better to select fma instead of fmac.
2795 let OtherPredicates = [NotHasTrue16BitInsts] in
2796 def : GCNPat <
2797   (fma (f16 (VOP3NoMods f32:$src0)),
2798        (f16 (VOP3NoMods f32:$src1)),
2799        (f16 (VOP3NoMods f32:$src2))),
2800   (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
2801                   SRCMODS.NONE, $src2)
2802 >;
2803 let OtherPredicates = [HasTrue16BitInsts] in
2804 def : GCNPat <
2805   (fma (f16 (VOP3NoMods f32:$src0)),
2806        (f16 (VOP3NoMods f32:$src1)),
2807        (f16 (VOP3NoMods f32:$src2))),
2808   (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
2809                   SRCMODS.NONE, $src2)
2810 >;
2811 }
2812
2813 let OtherPredicates = [HasFmacF64Inst] in
2814 // Don't allow source modifiers. If there are any source modifiers then it's
2815 // better to select fma instead of fmac.
2816 def : GCNPat <
2817   (fma (f64 (VOP3NoMods f64:$src0)),
2818        (f64 (VOP3NoMods f64:$src1)),
2819        (f64 (VOP3NoMods f64:$src2))),
2820   (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
2821                   SRCMODS.NONE, $src2)
2822 >;
2823
2824 // COPY is workaround tablegen bug from multiple outputs
2825 // from S_LSHL_B32's multiple outputs from implicit scc def.
2826 let AddedComplexity = 1 in {
2827 def : GCNPat <
2828   (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
2829   (S_LSHL_B32 SReg_32:$src1, (i16 16))
2830 >;
2831
2832 def : GCNPat <
2833   (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
2834   (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
2835 >;
2836
2837
2838 def : GCNPat <
2839   (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
2840   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
2841 >;
2842
2843 def : GCNPat <
2844   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
2845   (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
2846 >;
2847
2848 def : GCNPat <
2849   (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
2850   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
2851 >;
2852
2853 def : GCNPat <
2854   (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
2855   (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
2856 >;
2857
2858 def : GCNPat <
2859   (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))),
2860   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
2861 >;
2862
2863 def : GCNPat <
2864   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))),
2865   (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
2866 >;
2867
2868 def : GCNPat <
2869   (v2f16 (build_vector f16:$src0, (f16 undef))),
2870   (COPY $src0)
2871 >;
2872
2873 def : GCNPat <
2874   (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
2875   (S_LSHL_B32 SReg_32:$src1, (i32 16))
2876 >;
2877
2878 def : GCNPat <
2879   (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 VGPR_32:$src1))),
2880   (v2i16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
2881 >;
2882
2883
2884 def : GCNPat <
2885   (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
2886   (S_LSHL_B32 SReg_32:$src1, (i32 16))
2887 >;
2888
2889 def : GCNPat <
2890   (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 VGPR_32:$src1))),
2891   (v2f16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
2892 >;
2893 }
2894
2895 let SubtargetPredicate = HasVOP3PInsts in {
2896 def : GCNPat <
2897   (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
2898   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
2899 >;
2900
2901 def : GCNPat <
2902   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
2903   (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
2904 >;
2905
2906 // With multiple uses of the shift, this will duplicate the shift and
2907 // increase register pressure.
2908 def : GCNPat <
2909   (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
2910   (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
2911 >;
2912
2913 def : GCNPat <
2914   (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
2915                        (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
2916   (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
2917 >;
2918
2919 def : GCNPat <
2920   (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
2921   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
2922 >;
2923
2924
2925
2926 foreach Ty = [i16, f16] in {
2927
2928 defvar vecTy = !if(!eq(Ty, i16), v2i16, v2f16);
2929 defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero);
2930
2931 // Take the lower 16 bits from each VGPR_32 and concat them
2932 def : GCNPat <
2933   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
2934   (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
2935 >;
2936
2937
2938 // Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
2939 // Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
2940 def : GCNPat <
2941   (vecTy (DivergentBinFrag<build_vector> (Ty (immzeroTy)),
2942     (Ty !if(!eq(Ty, i16),
2943       (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
2944       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
2945   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
2946 >;
2947
2948
2949 // Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
2950 // Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
2951 def : GCNPat <
2952   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
2953     (Ty !if(!eq(Ty, i16),
2954       (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
2955       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
2956   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)),  VGPR_32:$a, VGPR_32:$b)
2957 >;
2958
2959
2960 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
2961 // Special case, can use V_ALIGNBIT (always uses encoded literal)
2962 def : GCNPat <
2963   (vecTy (DivergentBinFrag<build_vector>
2964     (Ty !if(!eq(Ty, i16),
2965       (Ty (trunc (srl VGPR_32:$a, (i32 16)))),
2966       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
2967     (Ty VGPR_32:$b))),
2968     (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))
2969 >;
2970
2971 // Take the upper 16 bits from each VGPR_32 and concat them
2972 def : GCNPat <
2973   (vecTy (DivergentBinFrag<build_vector>
2974     (Ty !if(!eq(Ty, i16),
2975       (Ty (trunc (srl VGPR_32:$a, (i32 16)))),
2976       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))),
2977     (Ty !if(!eq(Ty, i16),
2978       (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
2979       (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
2980   (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
2981 >;
2982
2983
2984 } // end foreach Ty
2985
2986
2987 let AddedComplexity = 5 in {
2988 def : GCNPat <
2989   (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
2990                                          (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
2991   (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
2992 >;
2993 }
2994 } // End SubtargetPredicate = HasVOP3PInsts
2995
2996 // With multiple uses of the shift, this will duplicate the shift and
2997 // increase register pressure.
2998 let SubtargetPredicate = isGFX11Plus in
2999 def : GCNPat <
3000   (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))),
3001   (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
3002 >;
3003
3004
3005 def : GCNPat <
3006   (v2f16 (scalar_to_vector f16:$src0)),
3007   (COPY $src0)
3008 >;
3009
3010 def : GCNPat <
3011   (v2i16 (scalar_to_vector i16:$src0)),
3012   (COPY $src0)
3013 >;
3014
3015 def : GCNPat <
3016   (v4i16 (scalar_to_vector i16:$src0)),
3017   (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
3018 >;
3019
3020 def : GCNPat <
3021   (v4f16 (scalar_to_vector f16:$src0)),
3022   (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
3023 >;
3024
3025 def : GCNPat <
3026   (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
3027                            timm:$bank_mask, timm:$bound_ctrl)),
3028   (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
3029                         (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
3030                         (as_i32timm $bank_mask),
3031                         (as_i1timm $bound_ctrl))
3032 >;
3033
3034 def : GCNPat <
3035   (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
3036                               timm:$bank_mask, timm:$bound_ctrl)),
3037   (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
3038                         (as_i32timm $row_mask), (as_i32timm $bank_mask),
3039                         (as_i1timm $bound_ctrl))
3040 >;
3041
3042 //===----------------------------------------------------------------------===//
3043 // Fract Patterns
3044 //===----------------------------------------------------------------------===//
3045
3046 let SubtargetPredicate = isGFX6 in {
3047
3048 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
3049 // used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
3050 // way to implement it is using V_FRACT_F64.
3051 // The workaround for the V_FRACT bug is:
3052 //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3053
3054 // Convert floor(x) to (x - fract(x))
3055
3056 // Don't bother handling this for GlobalISel, it's handled during
3057 // lowering.
3058 //
3059 // FIXME: DAG should also custom lower this.
3060 def : GCNPat <
3061   (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
3062   (V_ADD_F64_e64
3063       $mods,
3064       $x,
3065       SRCMODS.NEG,
3066       (V_CNDMASK_B64_PSEUDO
3067          (V_MIN_F64_e64
3068              SRCMODS.NONE,
3069              (V_FRACT_F64_e64 $mods, $x),
3070              SRCMODS.NONE,
3071              (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
3072          $x,
3073          (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
3074 >;
3075
3076 } // End SubtargetPredicates = isGFX6
3077
3078 //============================================================================//
3079 // Miscellaneous Optimization Patterns
3080 //============================================================================//
3081
3082 // Undo sub x, c -> add x, -c canonicalization since c is more likely
3083 // an inline immediate than -c.
3084 // TODO: Also do for 64-bit.
3085 def : GCNPat<
3086   (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
3087   (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
3088 >;
3089
3090 def : GCNPat<
3091   (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
3092   (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
3093   let SubtargetPredicate = HasAddNoCarryInsts;
3094 }
3095
3096 def : GCNPat<
3097   (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
3098   (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
3099   let SubtargetPredicate = NotHasAddNoCarryInsts;
3100 }
3101
3102
3103 // Avoid pointlessly materializing a constant in VGPR.
3104 // FIXME: Should also do this for readlane, but tablegen crashes on
3105 // the ignored src1.
3106 def : GCNPat<
3107   (int_amdgcn_readfirstlane (i32 imm:$src)),
3108   (S_MOV_B32 SReg_32:$src)
3109 >;
3110
3111 multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
3112   def : GCNPat <
3113     (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
3114     (BFM $a, $b)
3115   >;
3116
3117   def : GCNPat <
3118     (vt (ADD (vt (shl 1, vt:$a)), -1)),
3119     (BFM $a, (i32 0))
3120   >;
3121 }
3122
3123 defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
3124 // FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
3125 defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;
3126
3127 // Bitfield extract patterns
3128
3129 def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
3130   return isMask_32(Imm);
3131 }]>;
3132
3133 def IMMPopCount : SDNodeXForm<imm, [{
3134   return CurDAG->getTargetConstant(llvm::popcount(N->getZExtValue()), SDLoc(N),
3135                                    MVT::i32);
3136 }]>;
3137
3138 def : AMDGPUPat <
3139   (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
3140                          IMMZeroBasedBitfieldMask:$mask),
3141   (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
3142 >;
3143
3144 // x & ((1 << y) - 1)
3145 def : AMDGPUPat <
3146   (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
3147   (V_BFE_U32_e64 $src, (i32 0), $width)
3148 >;
3149
3150 // x & ~(-1 << y)
3151 def : AMDGPUPat <
3152   (DivergentBinFrag<and> i32:$src,
3153                          (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
3154   (V_BFE_U32_e64 $src, (i32 0), $width)
3155 >;
3156
3157 // x & (-1 >> (bitwidth - y))
3158 def : AMDGPUPat <
3159   (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
3160   (V_BFE_U32_e64 $src, (i32 0), $width)
3161 >;
3162
3163 // x << (bitwidth - y) >> (bitwidth - y)
3164 def : AMDGPUPat <
3165   (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
3166                          (sub 32, i32:$width)),
3167   (V_BFE_U32_e64 $src, (i32 0), $width)
3168 >;
3169
3170 def : AMDGPUPat <
3171   (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
3172                          (sub 32, i32:$width)),
3173   (V_BFE_I32_e64 $src, (i32 0), $width)
3174 >;
3175
3176 // SHA-256 Ma patterns
3177
3178 // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
3179 def : AMDGPUPatIgnoreCopies <
3180   (DivergentBinFrag<or> (and i32:$x, i32:$z),
3181                         (and i32:$y, (or i32:$x, i32:$z))),
3182   (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
3183                                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)),
3184                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32),
3185                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32))
3186 >;
3187
3188 def : AMDGPUPatIgnoreCopies <
3189   (DivergentBinFrag<or> (and i64:$x, i64:$z),
3190                         (and i64:$y, (or i64:$x, i64:$z))),
3191   (REG_SEQUENCE VReg_64,
3192     (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
3193                     (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))),
3194               (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
3195               (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
3196     (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
3197                     (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))),
3198               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
3199               (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
3200 >;
3201
3202 multiclass IntMed3Pat<Instruction med3Inst,
3203                  SDPatternOperator min,
3204                  SDPatternOperator max> {
3205
3206   // This matches 16 permutations of
3207   // min(max(a, b), max(min(a, b), c))
3208   def : AMDGPUPat <
3209   (min (max i32:$src0, i32:$src1),
3210        (max (min i32:$src0, i32:$src1), i32:$src2)),
3211   (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
3212 >;
3213
3214   // This matches 16 permutations of
3215   // max(min(x, y), min(max(x, y), z))
3216   def : AMDGPUPat <
3217   (max (min i32:$src0, i32:$src1),
3218        (min (max i32:$src0, i32:$src1), i32:$src2)),
3219   (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
3220 >;
3221 }
3222
3223 defm : IntMed3Pat<V_MED3_I32_e64, smin, smax>;
3224 defm : IntMed3Pat<V_MED3_U32_e64, umin, umax>;
3225
3226 multiclass FPMed3Pat<ValueType vt,
3227                 Instruction med3Inst> {
3228   // This matches 16 permutations of max(min(x, y), min(max(x, y), z))
3229   def : GCNPat<
3230     (fmaxnum_like_nnan
3231       (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3232                     (VOP3Mods vt:$src1, i32:$src1_mods)),
3233       (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3234                                   (VOP3Mods vt:$src1, i32:$src1_mods)),
3235                     (vt (VOP3Mods vt:$src2, i32:$src2_mods)))),
3236     (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
3237               DSTCLAMP.NONE, DSTOMOD.NONE)>;
3238
3239
3240   // This matches 16 permutations of min(max(x, y), max(min(x, y), z))
3241   def : GCNPat<
3242     (fminnum_like_nnan
3243       (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3244                     (VOP3Mods vt:$src1, i32:$src1_mods)),
3245       (fmaxnum_like (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3246                                   (VOP3Mods vt:$src1, i32:$src1_mods)),
3247                     (vt (VOP3Mods vt:$src2, i32:$src2_mods)))),
3248     (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
3249               DSTCLAMP.NONE, DSTOMOD.NONE)>;
3250 }
3251
3252 class FP16Med3Pat<ValueType vt,
3253                 Instruction med3Inst> : GCNPat<
3254   (fmaxnum_like_nnan (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3255                                    (VOP3Mods vt:$src1, i32:$src1_mods)),
3256            (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
3257                                        (VOP3Mods vt:$src1, i32:$src1_mods)),
3258                          (vt (VOP3Mods vt:$src2, i32:$src2_mods)))),
3259   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
3260 >;
3261
3262 multiclass Int16Med3Pat<Instruction med3Inst,
3263                         SDPatternOperator min,
3264                         SDPatternOperator max> {
3265   // This matches 16 permutations of
3266   // max(min(x, y), min(max(x, y), z))
3267   def : GCNPat <
3268   (max (min i16:$src0, i16:$src1),
3269        (min (max i16:$src0, i16:$src1), i16:$src2)),
3270   (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
3271 >;
3272
3273   // This matches 16 permutations of
3274   // min(max(a, b), max(min(a, b), c))
3275   def : GCNPat <
3276   (min (max i16:$src0, i16:$src1),
3277        (max (min i16:$src0, i16:$src1), i16:$src2)),
3278   (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
3279 >;
3280 }
3281
3282 defm : FPMed3Pat<f32, V_MED3_F32_e64>;
3283
3284 class
3285 IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
3286              SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
3287   (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
3288                                 i32:$src2),
3289   (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
3290 >;
3291
3292 class
3293 FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
3294             SDPatternOperator max_or_min_oneuse> : GCNPat <
3295   (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
3296                                  (VOP3Mods vt:$src1, i32:$src1_mods)),
3297                (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
3298   (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
3299               DSTCLAMP.NONE, DSTOMOD.NONE)
3300 >;
3301
3302 let OtherPredicates = [isGFX11Plus] in {
3303 def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
3304 def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
3305 def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
3306 def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
3307 def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
3308 def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
3309 def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
3310 def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
3311 }
3312
3313 let OtherPredicates = [isGFX9Plus] in {
3314 def : FP16Med3Pat<f16, V_MED3_F16_e64>;
3315 defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
3316 defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
3317 } // End Predicates = [isGFX9Plus]
3318
3319 class AMDGPUGenericInstruction : GenericInstruction {
3320   let Namespace = "AMDGPU";
3321 }
3322
3323 // Convert a wave address to a swizzled vector address (i.e. this is
3324 // for copying the stack pointer to a vector address appropriate to
3325 // use in the offset field of mubuf instructions).
3326 def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction {
3327   let OutOperandList = (outs type0:$dst);
3328   let InOperandList = (ins type0:$src);
3329   let hasSideEffects = 0;
3330 }
3331
3332 // Returns -1 if the input is zero.
3333 def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
3334   let OutOperandList = (outs type0:$dst);
3335   let InOperandList = (ins type1:$src);
3336   let hasSideEffects = 0;
3337 }
3338
3339 // Returns -1 if the input is zero.
3340 def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction {
3341   let OutOperandList = (outs type0:$dst);
3342   let InOperandList = (ins type1:$src);
3343   let hasSideEffects = 0;
3344 }
3345
3346 def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
3347   let OutOperandList = (outs type0:$dst);
3348   let InOperandList = (ins type1:$src);
3349   let hasSideEffects = 0;
3350 }
3351
3352 class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
3353   let OutOperandList = (outs type0:$dst);
3354   let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
3355                            type2:$soffset, untyped_imm_0:$offset,
3356                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3357   let hasSideEffects = 0;
3358   let mayLoad = 1;
3359 }
3360
3361 class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
3362   let OutOperandList = (outs type0:$dst);
3363   let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
3364                            type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
3365                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3366   let hasSideEffects = 0;
3367   let mayLoad = 1;
3368 }
3369
3370 def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
3371 def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
3372 def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
3373 def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
3374 def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
3375 def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
3376 def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
3377 def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
3378 def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
3379 def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
3380
3381 class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
3382   let OutOperandList = (outs);
3383   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
3384                            type2:$soffset, untyped_imm_0:$offset,
3385                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3386   let hasSideEffects = 0;
3387   let mayStore = 1;
3388 }
3389
3390 class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
3391   let OutOperandList = (outs);
3392   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
3393                            type2:$soffset, untyped_imm_0:$offset,
3394                            untyped_imm_0:$format,
3395                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3396   let hasSideEffects = 0;
3397   let mayStore = 1;
3398 }
3399
3400 def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
3401 def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
3402 def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
3403 def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
3404 def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
3405 def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
3406 def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
3407
3408 def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
3409   let OutOperandList = (outs type0:$dst);
3410   let InOperandList = (ins type0:$src0, type0:$src1);
3411   let hasSideEffects = 0;
3412 }
3413
3414 def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
3415   let OutOperandList = (outs type0:$dst);
3416   let InOperandList = (ins type0:$src0, type0:$src1);
3417   let hasSideEffects = 0;
3418 }
3419
3420 foreach N = 0-3 in {
3421 def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
3422   let OutOperandList = (outs type0:$dst);
3423   let InOperandList = (ins type0:$src0);
3424   let hasSideEffects = 0;
3425 }
3426 }
3427
3428 def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
3429   let OutOperandList = (outs type0:$dst);
3430   let InOperandList = (ins type0:$src0, type0:$src1);
3431   let hasSideEffects = 0;
3432 }
3433
3434 def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
3435   let OutOperandList = (outs type0:$dst);
3436   let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
3437   let hasSideEffects = 0;
3438 }
3439
3440 def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
3441   let OutOperandList = (outs type0:$dst);
3442   let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
3443   let hasSideEffects = 0;
3444 }
3445
3446 def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
3447   let OutOperandList = (outs type0:$dst);
3448   let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
3449   let hasSideEffects = 0;
3450 }
3451
3452 def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
3453   let OutOperandList = (outs type0:$dst);
3454   let InOperandList = (ins type0:$src);
3455   let hasSideEffects = 0;
3456 }
3457
3458 // Integer multiply-add: arg0 * arg1 + arg2.
3459 //
3460 // arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
3461 // arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
3462 class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
3463   let OutOperandList = (outs type0:$dst, type1:$carry_out);
3464   let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
3465   let hasSideEffects = 0;
3466 }
3467
3468 def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
3469 def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
3470
3471 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
3472 // operand Expects a MachineMemOperand in addition to explicit
3473 // operands.
3474 def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
3475   let OutOperandList = (outs type0:$oldval);
3476   let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
3477   let hasSideEffects = 0;
3478   let mayLoad = 1;
3479   let mayStore = 1;
3480 }
3481
3482 let Namespace = "AMDGPU" in {
3483 def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
3484 def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
3485 }
3486
3487 class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
3488   let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
3489   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
3490                            type2:$soffset, untyped_imm_0:$offset,
3491                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3492   let hasSideEffects = 0;
3493   let mayLoad = 1;
3494   let mayStore = 1;
3495 }
3496
3497 def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
3498 def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
3499 def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
3500 def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
3501 def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
3502 def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
3503 def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
3504 def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
3505 def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
3506 def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
3507 def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
3508 def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
3509 def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
3510 def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
3511 def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
3512
3513 def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
3514   let OutOperandList = (outs type0:$dst);
3515   let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
3516                            type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
3517                            untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
3518   let hasSideEffects = 0;
3519   let mayLoad = 1;
3520   let mayStore = 1;
3521 }
3522
3523 // Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
3524 // a workaround for the intrinsic being defined as readnone, but
3525 // really needs a memory operand.
3526 def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
3527   let OutOperandList = (outs type0:$dst);
3528   let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
3529   let hasSideEffects = 0;
3530   let mayLoad = 1;
3531   let mayStore = 0;
3532 }
3533
3534 // This is equivalent to the G_INTRINSIC*, but the operands may have
3535 // been legalized depending on the subtarget requirements.
3536 def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
3537   let OutOperandList = (outs type0:$dst);
3538   let InOperandList = (ins unknown:$intrin, variable_ops);
3539   let hasSideEffects = 0;
3540   let mayLoad = 1;
3541
3542   // FIXME: Use separate opcode for atomics.
3543   let mayStore = 1;
3544 }
3545
3546 def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
3547   let OutOperandList = (outs type0:$dst);
3548   let InOperandList = (ins unknown:$intrin, variable_ops);
3549   let hasSideEffects = 0;
3550   let mayLoad = 1;
3551
3552   // FIXME: Use separate opcode for atomics.
3553   let mayStore = 1;
3554 }
3555
3556 // This is equivalent to the G_INTRINSIC*, but the operands may have
3557 // been legalized depending on the subtarget requirements.
3558 def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
3559   let OutOperandList = (outs);
3560   let InOperandList = (ins unknown:$intrin, variable_ops);
3561   let hasSideEffects = 0;
3562   let mayStore = 1;
3563 }
3564
3565 def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
3566   let OutOperandList = (outs);
3567   let InOperandList = (ins unknown:$intrin, variable_ops);
3568   let hasSideEffects = 0;
3569   let mayStore = 1;
3570 }
3571
3572 def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
3573   let OutOperandList = (outs type0:$dst);
3574   let InOperandList = (ins unknown:$intrin, variable_ops);
3575   let hasSideEffects = 0;
3576   let mayLoad = 1;
3577   let mayStore = 0;
3578 }
3579
3580 // Generic instruction for SI_CALL, so we can select the register bank and insert a waterfall loop
3581 // if necessary.
3582 def G_SI_CALL : AMDGPUGenericInstruction {
3583   let OutOperandList = (outs SReg_64:$dst);
3584   let InOperandList = (ins type0:$src0, unknown:$callee);
3585   let Size = 4;
3586   let isCall = 1;
3587   let UseNamedOperandTable = 1;
3588   let SchedRW = [WriteBranch];
3589   // TODO: Should really base this on the call target
3590   let isConvergent = 1;
3591 }
3592
3593 def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
3594   let OutOperandList = (outs type0:$vdst);
3595   let InOperandList = (ins type1:$src0);
3596   let hasSideEffects = 0;
3597 }
3598
3599 def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
3600   let OutOperandList = (outs type0:$vdst);
3601   let InOperandList = (ins type1:$src0);
3602   let hasSideEffects = 0;
3603 }
3604
3605 //============================================================================//
3606 // Dummy Instructions
3607 //============================================================================//
3608
3609 def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
3610   let Inst{31-0} = 0x00000000;
3611   let FixedSize = 1;
3612   let Size = 4;
3613   let Uses = [EXEC];
3614   let hasSideEffects = 1;
3615   let SubtargetPredicate = isGFX10Plus;
3616 }