1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions,
10 // and properties of the instructions which are needed for code generation,
11 // machine code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
// Emits the register-register (rr) and register-memory (rm) forms of a
// scalar SSE1/SSE2 arithmetic instruction over PatFrag-free operands.
// NOTE(review): several original lines (end of parameter list, closing
// braces) are missing from this extract — confirm against upstream.
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
// isCodeGenOnly: these defs exist for instruction selection only and are
// kept out of the assembler/disassembler tables.
24 let isCodeGenOnly = 1 in {
// rr form is commutable so the register allocator may swap operands.
25 let isCommutable = 1 in {
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
// rm form folds a load of the second source operand.
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
42 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
// Like sse12_fp_scalar, but matches the intrinsic form (VT-typed whole
// vector in/out, memory operand matched through a ComplexPattern).
43 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
44 SDPatternOperator OpNode, RegisterClass RC,
45 ValueType VT, string asm, Operand memopr,
46 ComplexPattern mem_cpat, Domain d,
47 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
// hasSideEffects = 0: plain data ops; lets MI passes move/delete them.
48 let hasSideEffects = 0 in {
49 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
51 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
// rm_Int folds a load matched by mem_cpat for the second operand.
56 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
58 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60 [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
61 Sched<[sched.Folded, sched.ReadAfterFold]>;
65 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
// rr + rm forms for a packed (vector) SSE1/SSE2 arithmetic op; the memory
// form loads through mem_frag (alignment requirements live in the PatFrag).
66 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
67 RegisterClass RC, ValueType vt,
68 X86MemOperand x86memop, PatFrag mem_frag,
69 Domain d, X86FoldableSchedWrite sched,
71 let isCommutable = 1 in
72 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
74 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
79 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
81 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
85 Sched<[sched.Folded, sched.ReadAfterFold]>;
88 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
// Packed logical ops (and/or/xor/andn): selection patterns are supplied by
// the caller via pat_rr / pat_rm rather than built from an SDNode here.
89 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90 string OpcodeStr, X86MemOperand x86memop,
91 X86FoldableSchedWrite sched,
92 list<dag> pat_rr, list<dag> pat_rm,
94 let isCommutable = 1, hasSideEffects = 0 in
95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
// mayLoad must be explicit because the pattern list comes from outside.
101 let hasSideEffects = 0, mayLoad = 1 in
102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
107 Sched<[sched.Folded, sched.ReadAfterFold]>;
111 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112 // This is expanded by ExpandPostRAPseudos.
// Pseudos that materialize floating-point +0.0 in a scalar/vector register
// without a load; rematerializable and as cheap as a move.
113 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114 isPseudo = 1, SchedRW = [WriteZero] in {
115 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
116 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
117 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
118 [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
119 def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
120 [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123 //===----------------------------------------------------------------------===//
124 // AVX & SSE - Zero/One Vectors
125 //===----------------------------------------------------------------------===//
127 // Alias instruction that maps zero vector to pxor / xorp* for sse.
128 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
129 // swizzled by ExecutionDomainFix to pxor.
130 // We set canFoldAsLoad because this can be converted to a constant-pool
131 // load of an all-zeros value if folding it would be beneficial.
132 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
133 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
134 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
135 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
// Reuse the single v4f32-typed pseudo for every other 128-bit all-zeros
// type; the bit pattern is identical regardless of element type.
138 let Predicates = [NoAVX512] in {
139 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
140 def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
141 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
142 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
143 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
148 // and doesn't need it because on sandy bridge the register is set to zero
149 // at the rename stage without using any execution unit, so SET0PSY
150 // and SET0PDY can be used for vector int instructions without penalty
151 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
152 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
153 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
154 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
// Map every 256-bit all-zeros type onto the one v8i32-typed pseudo.
157 let Predicates = [NoAVX512] in {
158 def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
159 def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
160 def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
161 def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
162 def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
165 // We set canFoldAsLoad because this can be converted to a constant-pool
166 // load of an all-ones value if folding it would be beneficial.
// All-ones pseudos (expanded to pcmpeqd-style idioms post-RA). The AVX1
// 256-bit variant is gated on OptForMinSize; AVX2 gets its own pseudo.
167 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
168 isPseudo = 1, SchedRW = [WriteZero] in {
169 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
170 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
171 let Predicates = [HasAVX1Only, OptForMinSize] in {
172 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
173 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
175 let Predicates = [HasAVX2] in
176 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
177 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
180 //===----------------------------------------------------------------------===//
181 // SSE 1 & 2 - Move FP Scalar Instructions
183 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
184 // register copies because it's a partial register update; Register-to-register
185 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
186 // that the insert be implementable in terms of a copy, and just mentioned, we
187 // don't use movss/movsd for copies.
188 //===----------------------------------------------------------------------===//
// Register-register movss/movsd forms: the selecting rr def plus a
// reversed-encoding rr_REV def used only by the disassembler.
190 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
191 X86MemOperand x86memop, string base_opc,
192 string asm_opr, Domain d, string Name> {
193 let isCommutable = 1 in
194 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
195 (ins VR128:$src1, VR128:$src2),
196 !strconcat(base_opc, asm_opr),
197 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
198 Sched<[SchedWriteFShuffle.XMM]>;
200 // For the disassembler
// 0x11 is the store-direction opcode; FoldGenData links it back to rr so
// the memory-folding tables treat both encodings as one instruction.
201 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
202 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
203 (ins VR128:$src1, VR128:$src2),
204 !strconcat(base_opc, asm_opr), []>,
205 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
// Instantiates the VEX (V-prefixed) and legacy-SSE move-scalar families:
// rr/rr_REV via sse12_move_rr, the mr (store) forms, and ".s" asm aliases
// for the reversed encodings.
208 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
209 X86MemOperand x86memop, string OpcodeStr,
210 Domain d, string Name, Predicate pred> {
// VEX three-operand variant.
212 let Predicates = [UseAVX, OptForSize] in
213 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
214 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
216 VEX_4V, VEX_LIG, VEX_WIG;
218 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
219 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
220 [(store RC:$src, addr:$dst)], d>,
221 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// Legacy two-operand variant: destination is tied to $src1.
223 let Constraints = "$src1 = $dst" in {
224 let Predicates = [pred, NoSSE41_Or_OptForSize] in
225 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
226 "\t{$src2, $dst|$dst, $src2}", d, Name>;
229 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
231 [(store RC:$src, addr:$dst)], d>,
232 Sched<[WriteFStore]>;
// GAS-compatibility ".s" spellings map to the _REV encodings.
234 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
235 (!cast<Instruction>("V"#NAME#"rr_REV")
236 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
237 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
238 (!cast<Instruction>(NAME#"rr_REV")
239 VR128:$dst, VR128:$src2), 0>;
242 // Loading from memory automatically zeroing upper bits.
// rm forms produce a full VR128 with the upper elements zeroed (vzloadfrag);
// the _alt forms load directly into the scalar FR32/FR64 class instead.
243 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
244 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
246 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
248 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
249 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
250 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
251 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
252 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
255 // _alt version uses FR32/FR64 register class.
256 let isCodeGenOnly = 1 in {
257 def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
258 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
259 [(set RC:$dst, (mem_pat addr:$src))], d>,
260 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
261 def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
262 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
263 [(set RC:$dst, (mem_pat addr:$src))], d>,
// Instantiate the movss (SSE1, XS prefix) and movsd (SSE2, XD prefix)
// families; the rm loads are rematerializable constant-style loads.
268 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
269 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
270 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
271 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
273 let canFoldAsLoad = 1, isReMaterializable = 1 in {
274 defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
275 SSEPackedSingle>, XS;
276 defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
277 SSEPackedDouble>, XD;
// Selection patterns steering scalar loads / zero-extending moves onto the
// MOVSS/MOVSD (and VEX) instructions defined above.
281 let Predicates = [UseAVX] in {
// scalar_to_vector of a scalar load is just a vz-style scalar load.
282 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
283 (VMOVSSrm addr:$src)>;
284 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
285 (VMOVSDrm addr:$src)>;
287 // Represent the same patterns above but in the form they appear for
// 256-bit results: load into xmm, then SUBREG_TO_REG widens with zeros.
289 def : Pat<(v8f32 (X86vzload32 addr:$src)),
290 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
291 def : Pat<(v4f64 (X86vzload64 addr:$src)),
292 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
295 let Predicates = [UseAVX, OptForSize] in {
296 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
297 // MOVSS to the lower bits.
298 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
299 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
300 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
301 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
303 // Move low f32 and clear high bits.
304 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
305 (SUBREG_TO_REG (i32 0),
306 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
307 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
308 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
309 (SUBREG_TO_REG (i32 0),
310 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
311 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
314 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
315 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
316 // MOVSS to the lower bits.
317 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
318 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
319 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
320 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
// Non-VEX equivalents of the scalar_to_vector-of-load patterns above.
323 let Predicates = [UseSSE2] in
324 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
325 (MOVSDrm addr:$src)>;
327 let Predicates = [UseSSE1] in
328 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
329 (MOVSSrm addr:$src)>;
331 //===----------------------------------------------------------------------===//
332 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
333 //===----------------------------------------------------------------------===//
// Full-register packed FP moves (movaps/movapd/movups/movupd): a pure
// register copy (rr) plus a rematerializable load (rm).
335 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
336 X86MemOperand x86memop, PatFrag ld_frag,
337 string asm, Domain d,
338 X86SchedWriteMoveLS sched> {
// isMoveReg lets the copy-propagation passes treat rr as a plain move.
339 let hasSideEffects = 0, isMoveReg = 1 in
340 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
341 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
343 let canFoldAsLoad = 1, isReMaterializable = 1 in
344 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
345 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
346 [(set RC:$dst, (ld_frag addr:$src))], d>,
// VEX 128-bit and 256-bit aligned/unaligned packed moves, then the legacy
// SSE1/SSE2 128-bit variants. Aligned forms (0x28) use alignedload*,
// unaligned forms (0x10) use plain load*.
350 let Predicates = [HasAVX, NoVLX] in {
351 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
352 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
354 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
355 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
357 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
358 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
360 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
361 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
364 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
365 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
366 PS, VEX, VEX_L, VEX_WIG;
367 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
368 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
369 PD, VEX, VEX_L, VEX_WIG;
370 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
371 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
372 PS, VEX, VEX_L, VEX_WIG;
373 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
374 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
375 PD, VEX, VEX_L, VEX_WIG;
378 let Predicates = [UseSSE1] in {
379 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
380 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
382 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
383 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
386 let Predicates = [UseSSE2] in {
387 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
388 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
390 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
391 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// VEX store (mr) forms of the packed moves; 0x29 = aligned store,
// 0x11 = unaligned store. 128-bit then 256-bit groups.
395 let Predicates = [HasAVX, NoVLX] in {
396 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
397 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
398 "movaps\t{$src, $dst|$dst, $src}",
399 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
401 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
402 "movapd\t{$src, $dst|$dst, $src}",
403 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
405 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
406 "movups\t{$src, $dst|$dst, $src}",
407 [(store (v4f32 VR128:$src), addr:$dst)]>,
409 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
410 "movupd\t{$src, $dst|$dst, $src}",
411 [(store (v2f64 VR128:$src), addr:$dst)]>,
415 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
416 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
417 "movaps\t{$src, $dst|$dst, $src}",
418 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
420 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
421 "movapd\t{$src, $dst|$dst, $src}",
422 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
424 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
425 "movups\t{$src, $dst|$dst, $src}",
426 [(store (v8f32 VR256:$src), addr:$dst)]>,
428 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
429 "movupd\t{$src, $dst|$dst, $src}",
430 [(store (v4f64 VR256:$src), addr:$dst)]>,
// Reversed-encoding register-register forms, disassembler-only; FoldGenData
// ties each back to its forward-encoded twin for the folding tables.
436 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
438 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
439 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
441 "movaps\t{$src, $dst|$dst, $src}", []>,
442 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
443 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
445 "movapd\t{$src, $dst|$dst, $src}", []>,
446 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
447 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
449 "movups\t{$src, $dst|$dst, $src}", []>,
450 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
451 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
453 "movupd\t{$src, $dst|$dst, $src}", []>,
454 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
457 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
458 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
460 "movaps\t{$src, $dst|$dst, $src}", []>,
461 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
462 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
464 "movapd\t{$src, $dst|$dst, $src}", []>,
465 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
466 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
468 "movups\t{$src, $dst|$dst, $src}", []>,
469 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
470 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
472 "movupd\t{$src, $dst|$dst, $src}", []>,
473 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
477 // Reversed version with ".s" suffix for GAS compatibility.
// The trailing 0 marks each alias as parse-only (never used for printing).
478 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
479 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
480 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
481 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
482 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
483 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
484 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
485 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
486 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
487 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
488 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
489 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
490 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
491 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
492 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
493 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
// Legacy (non-VEX) 128-bit packed stores, their disassembler-only reversed
// rr forms, and the ".s" GAS-compatibility aliases.
495 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
496 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
497 "movaps\t{$src, $dst|$dst, $src}",
498 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
499 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
500 "movapd\t{$src, $dst|$dst, $src}",
501 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
502 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
503 "movups\t{$src, $dst|$dst, $src}",
504 [(store (v4f32 VR128:$src), addr:$dst)]>;
505 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
506 "movupd\t{$src, $dst|$dst, $src}",
507 [(store (v2f64 VR128:$src), addr:$dst)]>;
511 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
512 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
513 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
514 "movaps\t{$src, $dst|$dst, $src}", []>,
515 FoldGenData<"MOVAPSrr">;
516 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
517 "movapd\t{$src, $dst|$dst, $src}", []>,
518 FoldGenData<"MOVAPDrr">;
519 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520 "movups\t{$src, $dst|$dst, $src}", []>,
521 FoldGenData<"MOVUPSrr">;
522 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
523 "movupd\t{$src, $dst|$dst, $src}", []>,
524 FoldGenData<"MOVUPDrr">;
527 // Reversed version with ".s" suffix for GAS compatibility.
528 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
529 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
530 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
531 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
532 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
533 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
534 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
535 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
537 let Predicates = [HasAVX, NoVLX] in {
538 // 256-bit load/store need to use floating point load/store in case we don't
539 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
540 // available and changing the domain is beneficial.
// Integer 256-bit loads routed through VMOVAPSY/VMOVUPSY (FP domain).
541 def : Pat<(alignedloadv4i64 addr:$src),
542 (VMOVAPSYrm addr:$src)>;
543 def : Pat<(alignedloadv8i32 addr:$src),
544 (VMOVAPSYrm addr:$src)>;
545 def : Pat<(alignedloadv16i16 addr:$src),
546 (VMOVAPSYrm addr:$src)>;
547 def : Pat<(alignedloadv32i8 addr:$src),
548 (VMOVAPSYrm addr:$src)>;
549 def : Pat<(loadv4i64 addr:$src),
550 (VMOVUPSYrm addr:$src)>;
551 def : Pat<(loadv8i32 addr:$src),
552 (VMOVUPSYrm addr:$src)>;
553 def : Pat<(loadv16i16 addr:$src),
554 (VMOVUPSYrm addr:$src)>;
555 def : Pat<(loadv32i8 addr:$src),
556 (VMOVUPSYrm addr:$src)>;
// And the matching integer 256-bit stores.
558 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
559 (VMOVAPSYmr addr:$dst, VR256:$src)>;
560 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
561 (VMOVAPSYmr addr:$dst, VR256:$src)>;
562 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
563 (VMOVAPSYmr addr:$dst, VR256:$src)>;
564 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
565 (VMOVAPSYmr addr:$dst, VR256:$src)>;
566 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
567 (VMOVUPSYmr addr:$dst, VR256:$src)>;
568 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
569 (VMOVUPSYmr addr:$dst, VR256:$src)>;
570 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
571 (VMOVUPSYmr addr:$dst, VR256:$src)>;
572 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
573 (VMOVUPSYmr addr:$dst, VR256:$src)>;
576 // Use movaps / movups for SSE integer load / store (one byte shorter).
577 // The instructions selected below are then converted to MOVDQA/MOVDQU
578 // during the SSE domain pass.
579 let Predicates = [UseSSE1] in {
// 128-bit integer loads routed through MOVAPS/MOVUPS.
580 def : Pat<(alignedloadv2i64 addr:$src),
581 (MOVAPSrm addr:$src)>;
582 def : Pat<(alignedloadv4i32 addr:$src),
583 (MOVAPSrm addr:$src)>;
584 def : Pat<(alignedloadv8i16 addr:$src),
585 (MOVAPSrm addr:$src)>;
586 def : Pat<(alignedloadv16i8 addr:$src),
587 (MOVAPSrm addr:$src)>;
588 def : Pat<(loadv2i64 addr:$src),
589 (MOVUPSrm addr:$src)>;
590 def : Pat<(loadv4i32 addr:$src),
591 (MOVUPSrm addr:$src)>;
592 def : Pat<(loadv8i16 addr:$src),
593 (MOVUPSrm addr:$src)>;
594 def : Pat<(loadv16i8 addr:$src),
595 (MOVUPSrm addr:$src)>;
// And the matching integer 128-bit stores.
597 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
598 (MOVAPSmr addr:$dst, VR128:$src)>;
599 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
600 (MOVAPSmr addr:$dst, VR128:$src)>;
601 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
602 (MOVAPSmr addr:$dst, VR128:$src)>;
603 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
604 (MOVAPSmr addr:$dst, VR128:$src)>;
605 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
606 (MOVUPSmr addr:$dst, VR128:$src)>;
607 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
608 (MOVUPSmr addr:$dst, VR128:$src)>;
609 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
610 (MOVUPSmr addr:$dst, VR128:$src)>;
611 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
612 (MOVUPSmr addr:$dst, VR128:$src)>;
615 //===----------------------------------------------------------------------===//
616 // SSE 1 & 2 - Move Low packed FP Instructions
617 //===----------------------------------------------------------------------===//
// Emits the movlp/movhp load forms: a pattern-less single-precision PSrm
// (special-cased elsewhere) and a double-precision PDrm matching pdnode.
619 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
620 string base_opc, string asm_opr> {
621 // No pattern as they need be special cased between high and low.
622 let hasSideEffects = 0, mayLoad = 1 in
623 def PSrm : PI<opc, MRMSrcMem,
624 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
625 !strconcat(base_opc, "s", asm_opr),
626 [], SSEPackedSingle>, PS,
627 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
629 def PDrm : PI<opc, MRMSrcMem,
630 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
631 !strconcat(base_opc, "d", asm_opr),
632 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
633 (scalar_to_vector (loadf64 addr:$src2)))))],
634 SSEPackedDouble>, PD,
635 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
// Wrapper instantiating the VEX (3-operand) and legacy (tied-dest) forms.
638 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
640 let Predicates = [UseAVX] in
641 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
642 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
645 let Constraints = "$src1 = $dst" in
646 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
647 "\t{$src2, $dst|$dst, $src2}">;
// movlps/movlpd loads (low 64 bits) plus their store (mr) forms and the
// SSE1-only selection patterns for MOVLPS.
650 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
652 let SchedRW = [WriteFStore] in {
653 let Predicates = [UseAVX] in {
654 let mayStore = 1, hasSideEffects = 0 in
655 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
656 "movlps\t{$src, $dst|$dst, $src}",
// movlpd store = store of element 0 of the v2f64 source.
659 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
660 "movlpd\t{$src, $dst|$dst, $src}",
661 [(store (f64 (extractelt (v2f64 VR128:$src),
662 (iPTR 0))), addr:$dst)]>,
665 let mayStore = 1, hasSideEffects = 0 in
666 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
667 "movlps\t{$src, $dst|$dst, $src}",
669 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
670 "movlpd\t{$src, $dst|$dst, $src}",
671 [(store (f64 (extractelt (v2f64 VR128:$src),
672 (iPTR 0))), addr:$dst)]>;
675 let Predicates = [UseSSE1] in {
676 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
677 // end up with a movsd or blend instead of shufp.
678 // No need for aligned load, we're only loading 64-bits.
679 def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
681 (MOVLPSrm VR128:$src1, addr:$src2)>;
682 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
683 (MOVLPSrm VR128:$src1, addr:$src2)>;
// Zero-extending 64-bit load: load low into a zeroed register.
685 def : Pat<(v4f32 (X86vzload64 addr:$src)),
686 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
687 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
688 (MOVLPSmr addr:$dst, VR128:$src)>;
691 //===----------------------------------------------------------------------===//
692 // SSE 1 & 2 - Move Hi packed FP Instructions
693 //===----------------------------------------------------------------------===//
// movhps/movhpd loads (high 64 bits) and their store forms; the movhpd
// store pattern stores element 0 of an unpack-high of the source.
695 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
697 let SchedRW = [WriteFStore] in {
698 // v2f64 extract element 1 is always custom lowered to unpack high to low
699 // and extract element 0 so the non-store version isn't too horrible.
700 let Predicates = [UseAVX] in {
701 let mayStore = 1, hasSideEffects = 0 in
702 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703 "movhps\t{$src, $dst|$dst, $src}",
705 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
706 "movhpd\t{$src, $dst|$dst, $src}",
707 [(store (f64 (extractelt
708 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
709 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
711 let mayStore = 1, hasSideEffects = 0 in
712 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713 "movhps\t{$src, $dst|$dst, $src}",
715 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
716 "movhpd\t{$src, $dst|$dst, $src}",
717 [(store (f64 (extractelt
718 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
719 (iPTR 0))), addr:$dst)]>;
// Selection patterns steering unpack/shuffle-of-load and extract-store DAGs
// onto the MOVHP*/MOVLP* instructions, for AVX then legacy SSE.
722 let Predicates = [UseAVX] in {
723 // Also handle an i64 load because that may get selected as a faster way to
725 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
726 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
727 (VMOVHPDrm VR128:$src1, addr:$src2)>;
728 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
729 (VMOVHPDrm VR128:$src1, addr:$src2)>;
731 def : Pat<(store (f64 (extractelt
732 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
733 (iPTR 0))), addr:$dst),
734 (VMOVHPDmr addr:$dst, VR128:$src)>;
// Low-half merge from a zero-extending load selects movlpd.
737 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
738 (VMOVLPDrm VR128:$src1, addr:$src2)>;
741 let Predicates = [UseSSE1] in {
742 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
743 // end up with a movsd or blend instead of shufp.
744 // No need for aligned load, we're only loading 64-bits.
745 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
746 (MOVHPSrm VR128:$src1, addr:$src2)>;
747 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
748 (MOVHPSrm VR128:$src1, addr:$src2)>;
750 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
752 (MOVHPSmr addr:$dst, VR128:$src)>;
755 let Predicates = [UseSSE2] in {
758 // Also handle an i64 load because that may get selected as a faster way to
760 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
761 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
762 (MOVHPDrm VR128:$src1, addr:$src2)>;
763 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
764 (MOVHPDrm VR128:$src1, addr:$src2)>;
766 def : Pat<(store (f64 (extractelt
767 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
768 (iPTR 0))), addr:$dst),
769 (MOVHPDmr addr:$dst, VR128:$src)>;
772 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
773 (MOVLPDrm VR128:$src1, addr:$src2)>;
776 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
777 // Use MOVLPD to load into the low bits from a full vector unless we can use
779 def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
780 (MOVLPDrm VR128:$src1, addr:$src2)>;
783 //===----------------------------------------------------------------------===//
784 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
785 //===----------------------------------------------------------------------===//
// AVX three-operand MOVLHPS/MOVHLPS register shuffles (VEX encoded).
// NOTE(review): lines 791 and 798 (the '[(set VR128:$dst,' halves of each
// pattern list) and the modifier after the trailing comma on line 800 are
// missing from this chunk — code left byte-identical.
787 let Predicates = [UseAVX] in {
788 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
789 (ins VR128:$src1, VR128:$src2),
790 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
792 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
793 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
794 let isCommutable = 1 in
795 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
796 (ins VR128:$src1, VR128:$src2),
797 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
799 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
800 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
// Legacy-SSE two-operand MOVLHPS/MOVHLPS: destination is tied to $src1.
// NOTE(review): lines 807 and 814 ('[(set VR128:$dst,') are missing from
// this chunk — code left byte-identical.
803 let Constraints = "$src1 = $dst" in {
804 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
805 (ins VR128:$src1, VR128:$src2),
806 "movlhps\t{$src2, $dst|$dst, $src2}",
808 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
809 Sched<[SchedWriteFShuffle.XMM]>;
810 let isCommutable = 1 in
811 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
812 (ins VR128:$src1, VR128:$src2),
813 "movhlps\t{$src2, $dst|$dst, $src2}",
815 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
816 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
819 //===----------------------------------------------------------------------===//
820 // SSE 1 & 2 - Conversion Instructions
821 //===----------------------------------------------------------------------===//
// Scalar conversion template: emits a reg-reg ('rr') and a reg-mem ('rm')
// form applying OpNode to SrcRC / a ld_frag load. 'mem' is the mnemonic
// used for the memory form; Int2Fpu is an extra scheduler read for rr.
// NOTE(review): line 826 (presumably the 'Domain d' parameter used by the
// ExeDomain below) is missing from this chunk, as is the closing '}'.
823 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
824 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
825 string asm, string mem, X86FoldableSchedWrite sched,
827 SchedRead Int2Fpu = ReadDefault> {
828 let ExeDomain = d in {
829 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
830 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
831 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
832 Sched<[sched, Int2Fpu]>;
833 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
834 mem#"\t{$src, $dst|$dst, $src}",
835 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
836 Sched<[sched.Folded]>;
// Packed int->fp conversion template (any_sint_to_fp), rr and rm forms.
// Reads MXCSR and may raise FP exceptions; no other side effects modeled.
// NOTE(review): lines 846-847 (the rr Sched<...> and separator) and the
// closing braces are missing from this chunk — code left byte-identical.
840 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
841 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
842 string asm, Domain d, X86FoldableSchedWrite sched> {
843 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
844 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
845 [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
848 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
849 [(set RC:$dst, (DstTy (any_sint_to_fp
850 (SrcTy (ld_frag addr:$src)))))], d>,
851 Sched<[sched.Folded]>;
// AVX three-operand scalar conversion template (no patterns — assembly/
// encoding only, hasSideEffects = 0). The rm form appends "{mem}" to the
// mnemonic so the assembler can disambiguate operand size.
855 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
856 X86MemOperand x86memop, string asm, string mem,
857 X86FoldableSchedWrite sched, Domain d> {
858 let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
859 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
860 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
861 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
863 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
864 (ins DstRC:$src1, x86memop:$src),
865 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
866 Sched<[sched.Folded, sched.ReadAfterFold]>;
867 } // hasSideEffects = 0
// AVX truncating fp->int scalar conversions (codegen-only FR32/FR64 forms).
// NOTE(review): lines 874 and 882 (the 'XS, VEX, VEX_LIG;' style suffixes of
// the 32-bit variants) are missing from this chunk — code left byte-identical.
870 let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
871 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
872 "cvttss2si", "cvttss2si",
873 WriteCvtSS2I, SSEPackedSingle>,
875 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
876 "cvttss2si", "cvttss2si",
877 WriteCvtSS2I, SSEPackedSingle>,
878 XS, VEX, VEX_W, VEX_LIG;
879 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
880 "cvttsd2si", "cvttsd2si",
881 WriteCvtSD2I, SSEPackedDouble>,
883 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
884 "cvttsd2si", "cvttsd2si",
885 WriteCvtSD2I, SSEPackedDouble>,
886 XD, VEX, VEX_W, VEX_LIG;
889 // The assembler can recognize rr 64-bit instructions by seeing a rxx
890 // register, but the same isn't true when only using memory operands,
891 // provide other assembly "l" and "q" forms to address this explicitly
892 // where appropriate to do so.
// AVX int->fp scalar conversions via sse12_vcvt_avx ("l"/"q" memory-form
// mnemonic suffixes). Codegen-only; patterns are provided separately below.
// NOTE(review): lines 896 and 902 (trailing modifiers of the 32-bit
// variants, e.g. 'VEX_LIG, SIMD_EXC;') are missing from this chunk.
893 let isCodeGenOnly = 1 in {
894 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
895 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
897 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
898 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
899 VEX_W, VEX_LIG, SIMD_EXC;
900 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
901 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
903 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
904 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
905 VEX_W, VEX_LIG, SIMD_EXC;
906 } // isCodeGenOnly = 1
// Select the pattern-less AVX cvtsi2ss/sd defs above for sint->fp: the
// tied first source is fed an IMPLICIT_DEF since only the low element is
// written. Load forms fold the integer load directly.
908 let Predicates = [UseAVX] in {
909 def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
910 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
911 def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
912 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
913 def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
914 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
915 def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
916 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
918 def : Pat<(f32 (any_sint_to_fp GR32:$src)),
919 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
920 def : Pat<(f32 (any_sint_to_fp GR64:$src)),
921 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
922 def : Pat<(f64 (any_sint_to_fp GR32:$src)),
923 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
924 def : Pat<(f64 (any_sint_to_fp GR64:$src)),
925 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
// Legacy-SSE scalar conversions (non-VEX): truncating fp->int and int->fp,
// 32- and 64-bit (REX_W) variants, with patterns supplied by sse12_cvt_s.
// NOTE(review): CVTSI2SD (line 949) lacks SIMD_EXC while its siblings have
// it — possibly intentional (i32->f64 is exact), possibly a dropped token
// in this chunk; confirm against the full file.
928 let isCodeGenOnly = 1 in {
929 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
930 "cvttss2si", "cvttss2si",
931 WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
932 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
933 "cvttss2si", "cvttss2si",
934 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
935 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
936 "cvttsd2si", "cvttsd2si",
937 WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
938 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
939 "cvttsd2si", "cvttsd2si",
940 WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
941 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
942 "cvtsi2ss", "cvtsi2ss{l}",
943 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
944 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
945 "cvtsi2ss", "cvtsi2ss{q}",
946 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
947 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
948 "cvtsi2sd", "cvtsi2sd{l}",
949 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
950 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
951 "cvtsi2sd", "cvtsi2sd{q}",
952 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
953 } // isCodeGenOnly = 1
955 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
956 // and/or XMM operand(s).
// Intrinsic-form scalar conversion template (XMM source): rr_Int applies
// OpNode to a full SrcVT register, rm_Int to the mem_cpat complex pattern.
// NOTE(review): line 966 (the rr_Int 'Sched<[sched]>;' line) and the
// closing braces are missing from this chunk — code left byte-identical.
958 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
959 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
960 Operand memop, ComplexPattern mem_cpat, string asm,
961 X86FoldableSchedWrite sched, Domain d> {
962 let ExeDomain = d in {
963 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
964 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
965 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
967 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
968 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
969 [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
970 Sched<[sched.Folded]>;
// Three-address intrinsic conversion template (no patterns): Is2Addr picks
// between the tied two-operand and AVX three-operand assembly strings; the
// rm_Int mnemonic carries a "{mem}" size suffix.
// NOTE(review): lines 980 and 987 (the '!if(Is2Addr,' selectors) are
// missing from this chunk — code left byte-identical.
974 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
975 RegisterClass DstRC, X86MemOperand x86memop,
976 string asm, string mem, X86FoldableSchedWrite sched,
977 Domain d, bit Is2Addr = 1> {
978 let hasSideEffects = 0, ExeDomain = d in {
979 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
981 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
982 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
983 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
985 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
986 (ins DstRC:$src1, x86memop:$src2),
988 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
989 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
990 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// cvtsd2si intrinsic forms (current-rounding-mode fp->int), AVX and legacy,
// 32/64-bit destinations; all read MXCSR and may raise FP exceptions.
994 let Uses = [MXCSR], mayRaiseFPException = 1 in {
995 let Predicates = [UseAVX] in {
996 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
997 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
998 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
999 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1000 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1001 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
1003 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1004 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1005 SSEPackedDouble>, XD;
1006 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1007 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1008 SSEPackedDouble>, XD, REX_W;
// cvtsi2ss/sd intrinsic three-address forms: AVX variants pass Is2Addr=0
// (VEX three-operand syntax); legacy variants tie $src1 = $dst.
// NOTE(review): lines 1028 and 1034 ('XS, SIMD_EXC;' / 'XD;'-style suffixes
// of CVTSI2SS and CVTSI2SD) are missing from this chunk.
1011 let Predicates = [UseAVX] in {
1012 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1013 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1014 XS, VEX_4V, VEX_LIG, SIMD_EXC;
1015 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1016 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1017 XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1018 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1019 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1020 XD, VEX_4V, VEX_LIG;
1021 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1022 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1023 XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1025 let Constraints = "$src1 = $dst" in {
1026 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1027 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1029 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1030 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1031 XS, REX_W, SIMD_EXC;
1032 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1033 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1035 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1036 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1037 XD, REX_W, SIMD_EXC;
// AT&T-syntax assembler aliases mapping size-suffixed (and suffix-less
// i32-defaulting) cvtsi2ss/sd spellings onto the intrinsic register/memory
// forms defined above. Emit-priority 0: parse-only, never printed.
1040 def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1041 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1042 def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1043 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1044 def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1045 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1046 def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1047 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1049 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1050 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1051 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1052 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1054 def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1055 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1056 def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1057 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1058 def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1059 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1060 def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1061 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1063 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1064 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1065 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1066 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1070 // Aliases for intrinsics
// Truncating fp->int intrinsic forms (X86cvtts2Int), AVX and legacy.
// NOTE(review): VCVTTSD2SI/VCVTTSD2SI64 use WriteCvtSS2I (lines 1081, 1084)
// while the legacy CVTTSD2SI pair uses WriteCvtSD2I — possibly a sched-class
// inconsistency upstream; left as-is. Lines 1094 and 1101-1103 (the 'XS,
// REX_W;' / 'XD, REX_W;' suffixes and closing braces) are missing from this
// chunk.
1071 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1072 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1073 ssmem, sse_load_f32, "cvttss2si",
1074 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1075 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1076 X86cvtts2Int, ssmem, sse_load_f32,
1077 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1078 XS, VEX, VEX_LIG, VEX_W;
1079 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1080 sdmem, sse_load_f64, "cvttsd2si",
1081 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1082 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1083 X86cvtts2Int, sdmem, sse_load_f64,
1084 "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
1085 XD, VEX, VEX_LIG, VEX_W;
1087 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1088 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1089 ssmem, sse_load_f32, "cvttss2si",
1090 WriteCvtSS2I, SSEPackedSingle>, XS;
1091 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1092 X86cvtts2Int, ssmem, sse_load_f32,
1093 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1095 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1096 sdmem, sse_load_f64, "cvttsd2si",
1097 WriteCvtSD2I, SSEPackedDouble>, XD;
1098 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1099 X86cvtts2Int, sdmem, sse_load_f64,
1100 "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
// AT&T-syntax aliases for size-suffixed cvttss2si/cvttsd2si spellings,
// covering register and memory forms of both AVX and legacy encodings.
1104 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1105 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1106 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1107 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1108 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1109 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1110 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1111 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1112 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1113 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1114 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1115 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1116 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1117 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1118 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1119 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1121 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1122 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1123 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1124 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1125 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1126 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1127 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1128 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1129 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1130 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1131 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1132 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1133 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1134 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1135 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1136 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
// cvtss2si intrinsic forms (current-rounding-mode), AVX and legacy,
// 32/64-bit destinations.
// NOTE(review): line 1145 (the '}' closing the UseAVX 'let') is missing
// from this chunk.
1138 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1139 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1140 ssmem, sse_load_f32, "cvtss2si",
1141 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1142 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1143 ssmem, sse_load_f32, "cvtss2si",
1144 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
1146 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1147 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1148 ssmem, sse_load_f32, "cvtss2si",
1149 WriteCvtSS2I, SSEPackedSingle>, XS;
1150 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1151 ssmem, sse_load_f32, "cvtss2si",
1152 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
// Packed i32->f32 conversions via sse12_cvt_p: AVX 128/256-bit (NoVLX so
// EVEX forms take priority when available) and the legacy SSE2 form.
1154 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1155 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1156 SSEPackedSingle, WriteCvtI2PS>,
1157 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1158 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1159 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1160 SSEPackedSingle, WriteCvtI2PSY>,
1161 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1163 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1164 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1165 SSEPackedSingle, WriteCvtI2PS>,
1166 PS, Requires<[UseSSE2]>;
// AT&T-syntax aliases for size-suffixed cvtss2si/cvtsd2si spellings,
// register and memory forms, AVX and legacy encodings.
1170 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1171 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1172 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1173 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1174 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1175 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1176 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1177 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1178 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1179 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1180 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1181 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1182 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1183 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1184 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1185 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1188 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1189 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1190 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1191 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1192 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1193 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1194 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1195 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1196 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1197 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1198 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1199 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1200 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1201 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1202 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1203 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1207 // Convert scalar double to scalar single
// Scalar f64->f32 truncating conversion: AVX FR-register forms (no
// patterns; matched via the Pat at line 1222, which appears cut — its
// trailing Requires<[UseAVX]> is missing from this chunk) and legacy SSE
// forms carrying any_fpround patterns directly.
1208 let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
1209 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1210 (ins FR32:$src1, FR64:$src2),
1211 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1212 VEX_4V, VEX_LIG, VEX_WIG,
1213 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// Memory form is marked mayLoad-implicit via MRMSrcMem; no pattern either.
1215 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1216 (ins FR32:$src1, f64mem:$src2),
1217 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1218 XD, VEX_4V, VEX_LIG, VEX_WIG,
1219 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1222 def : Pat<(f32 (any_fpround FR64:$src)),
1223 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1226 let isCodeGenOnly = 1 in {
1227 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1228 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1229 [(set FR32:$dst, (any_fpround FR64:$src))]>,
1230 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// Memory form only selected under OptForSize (a load+rr is otherwise better).
1231 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1232 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1233 [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1234 XD, Requires<[UseSSE2, OptForSize]>,
1235 Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
// cvtsd2ss intrinsic forms (X86frounds merges the rounded low element into
// $src1's upper lanes). AVX three-operand and legacy tied-operand variants.
// NOTE(review): lines 1242, 1249, 1257, 1263 ('[(set VR128:$dst,') are
// missing from this chunk — code left byte-identical.
1238 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1239 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1240 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1241 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1243 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1244 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1245 Sched<[WriteCvtSD2SS]>;
1246 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1247 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1248 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1250 (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
1251 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1252 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1253 let Constraints = "$src1 = $dst" in {
1254 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1255 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1256 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1258 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1259 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1260 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1261 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1262 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1264 (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
1265 XD, Requires<[UseSSE2]>,
1266 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1270 // Convert scalar single to scalar double
1271 // SSE2 instructions with XS prefix
// Scalar f32->f64 extension: AVX FR-register forms without patterns
// (selected via the two Pats below; the rm Pat is OptForSize-gated so the
// load is only folded when minimizing size), plus legacy SSE forms with
// any_fpextend patterns attached.
1272 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1273 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1274 (ins FR64:$src1, FR32:$src2),
1275 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1276 XS, VEX_4V, VEX_LIG, VEX_WIG,
1277 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
1279 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1280 (ins FR64:$src1, f32mem:$src2),
1281 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1282 XS, VEX_4V, VEX_LIG, VEX_WIG,
1283 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1284 Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1285 } // isCodeGenOnly = 1, hasSideEffects = 0
1287 def : Pat<(f64 (any_fpextend FR32:$src)),
1288 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1289 def : Pat<(any_fpextend (loadf32 addr:$src)),
1290 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1292 let isCodeGenOnly = 1 in {
1293 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1294 "cvtss2sd\t{$src, $dst|$dst, $src}",
1295 [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1296 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1297 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1298 "cvtss2sd\t{$src, $dst|$dst, $src}",
1299 [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1300 XS, Requires<[UseSSE2, OptForSize]>,
1301 Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1302 } // isCodeGenOnly = 1
// cvtss2sd intrinsic forms: pattern-less (hasSideEffects = 0), matched by
// the movs-elimination patterns further down. AVX three-operand (note
// Requires<[HasAVX]> here vs. UseAVX elsewhere) and legacy tied-operand
// variants.
1304 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
1305 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1306 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1307 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1308 []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1309 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1311 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1312 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1313 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1314 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1315 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1316 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1317 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1318 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1319 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1320 []>, XS, Requires<[UseSSE2]>,
1321 Sched<[WriteCvtSS2SD]>;
1323 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1324 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1325 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1326 []>, XS, Requires<[UseSSE2]>,
1327 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1329 } // hasSideEffects = 0
1331 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1332 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1333 // vmovs{s,d} instructions
// AVX patterns folding an X86Movss/X86Movsd-of-(scalar conversion) into the
// corresponding _Int instruction, eliminating the extra vmovss/vmovsd the
// clang intrinsic lowering would otherwise produce.
// NOTE(review): each pattern's first operand line (1336, 1342, 1348, ...,
// the 'VR128:$dst,' destination-merge operand) is missing from this chunk —
// code left byte-identical.
1334 let Predicates = [UseAVX] in {
1335 def : Pat<(v4f32 (X86Movss
1337 (v4f32 (scalar_to_vector
1338 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1339 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1341 def : Pat<(v2f64 (X86Movsd
1343 (v2f64 (scalar_to_vector
1344 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1345 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1347 def : Pat<(v4f32 (X86Movss
1349 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1350 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1352 def : Pat<(v4f32 (X86Movss
1354 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1355 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1357 def : Pat<(v4f32 (X86Movss
1359 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1360 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1362 def : Pat<(v4f32 (X86Movss
1364 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1365 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1367 def : Pat<(v2f64 (X86Movsd
1369 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1370 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1372 def : Pat<(v2f64 (X86Movsd
1374 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1375 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1377 def : Pat<(v2f64 (X86Movsd
1379 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1380 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1382 def : Pat<(v2f64 (X86Movsd
1384 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1385 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1386 } // Predicates = [UseAVX]
// Legacy-SSE2 counterparts of the movs-elimination patterns above,
// targeting the tied-operand CVT*_Int instructions.
// NOTE(review): as with the AVX group, each pattern's 'VR128:$dst,' operand
// line is missing from this chunk — code left byte-identical.
1388 let Predicates = [UseSSE2] in {
1389 def : Pat<(v4f32 (X86Movss
1391 (v4f32 (scalar_to_vector
1392 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1393 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1395 def : Pat<(v2f64 (X86Movsd
1397 (v2f64 (scalar_to_vector
1398 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1399 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1401 def : Pat<(v2f64 (X86Movsd
1403 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1404 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1406 def : Pat<(v2f64 (X86Movsd
1408 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1409 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1411 def : Pat<(v2f64 (X86Movsd
1413 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1414 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1416 def : Pat<(v2f64 (X86Movsd
1418 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1419 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1420 } // Predicates = [UseSSE2]
// SSE1 movss-elimination patterns for the int->f32 intrinsic forms (the
// f32 cases only; f64 conversions require SSE2 and live in the group above).
1422 let Predicates = [UseSSE1] in {
1423 def : Pat<(v4f32 (X86Movss
1425 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1426 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1428 def : Pat<(v4f32 (X86Movss
1430 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1431 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1433 def : Pat<(v4f32 (X86Movss
1435 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1436 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1438 def : Pat<(v4f32 (X86Movss
1440 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1441 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1442 } // Predicates = [UseSSE1]
// Packed f32->i32 conversion (round per MXCSR): AVX 128/256-bit under
// NoVLX, then the legacy SSE2 pair (memop requires alignment there).
// NOTE(review): lines 1452, 1457, 1462, 1472 ('[(set VR128:$dst,' /
// '[(set VR256:$dst,') are missing from this chunk.
1444 let Predicates = [HasAVX, NoVLX] in {
1445 // Convert packed single/double fp to doubleword
1446 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1447 "cvtps2dq\t{$src, $dst|$dst, $src}",
1448 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1449 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1450 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1451 "cvtps2dq\t{$src, $dst|$dst, $src}",
1453 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1454 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1455 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1456 "cvtps2dq\t{$src, $dst|$dst, $src}",
1458 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1459 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1460 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1461 "cvtps2dq\t{$src, $dst|$dst, $src}",
1463 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1464 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1466 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1467 "cvtps2dq\t{$src, $dst|$dst, $src}",
1468 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1469 Sched<[WriteCvtPS2I]>, SIMD_EXC;
1470 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1471 "cvtps2dq\t{$src, $dst|$dst, $src}",
1473 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1474 Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1477 // Convert Packed Double FP to Packed DW Integers
// Packed f64->i32 conversion: AVX uses explicit {x}/{y} suffixed memory
// mnemonics since the 128- vs 256-bit source is not inferable from a
// memory operand; aliases map vcvtpd2dqx/y onto the rr forms. Legacy SSE2
// pair follows.
// NOTE(review): several '[(set VR128:$dst,' lines (1484, 1491, 1498, 1503,
// 1515, 1520) are missing from this chunk — code left byte-identical.
1478 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1479 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1480 // register, but the same isn't true when using memory operands instead.
1481 // Provide other assembly rr and rm forms to address this explicitly.
1482 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1483 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1485 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1486 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1489 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1490 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1492 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1493 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1496 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1497 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1499 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1500 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1501 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1502 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1504 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1505 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1508 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1509 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1510 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1511 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1513 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1514 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1516 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1517 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1518 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1519 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1521 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1522 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1524 // Convert with truncation packed single/double fp to doubleword
1525 // SSE2 packed instructions with XS prefix
// Truncating packed f32->i32 (X86any_cvttp2si): AVX 128/256-bit under
// NoVLX, then the legacy SSE2 pair.
// NOTE(review): lines 1530, 1535, 1540, 1545 ('[(set ...$dst,'), 1547
// (the 'VEX, VEX_L,' of VCVTTPS2DQYrm), and 1553, 1558 are missing from
// this chunk — code left byte-identical.
1526 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1527 let Predicates = [HasAVX, NoVLX] in {
1528 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1529 "cvttps2dq\t{$src, $dst|$dst, $src}",
1531 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1532 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1533 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1534 "cvttps2dq\t{$src, $dst|$dst, $src}",
1536 (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1537 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1538 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1539 "cvttps2dq\t{$src, $dst|$dst, $src}",
1541 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1542 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1543 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1544 "cvttps2dq\t{$src, $dst|$dst, $src}",
1546 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1548 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1551 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1552 "cvttps2dq\t{$src, $dst|$dst, $src}",
1554 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1555 Sched<[WriteCvtPS2I]>;
1556 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1557 "cvttps2dq\t{$src, $dst|$dst, $src}",
1559 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1560 Sched<[WriteCvtPS2ILd]>;
1563 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1564 // register, but the same isn't true when using memory operands instead.
1565 // Provide other assembly rr and rm forms to address this explicitly.
1566 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1568 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1569 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1571 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1572 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1573 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1574 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1576 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1577 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1580 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1581 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1583 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1584 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1585 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1586 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1588 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1589 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1590 } // Predicates = [HasAVX, NoVLX]
1592 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1593 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1594 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1595 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1597 let Predicates = [HasAVX, NoVLX] in {
1598 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1599 (VCVTTPD2DQYrr VR256:$src)>;
1600 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1601 (VCVTTPD2DQYrm addr:$src)>;
1604 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1605 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1607 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1608 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1609 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1610 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1612 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1613 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1615 // Convert packed single to packed double
// Widening conversions: 128-bit forms read only the low 2 floats (f64mem /
// X86any_vfpext), 256-bit forms extend all four lanes (any_fpextend).
1616 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1617 // SSE2 instructions without OpSize prefix
1618 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1619 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1620 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1621 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
// Memory form loads only 64 bits (two f32s) and extends them via extloadv2f32.
1622 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1623 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1624 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1625 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1626 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1627 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1628 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1629 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1630 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1631 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1632 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1633 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
// Legacy SSE2 forms, selected only when AVX is unavailable (UseSSE2).
1636 let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1637 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1638 "cvtps2pd\t{$src, $dst|$dst, $src}",
1639 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1640 PS, Sched<[WriteCvtPS2PD]>;
1641 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1642 "cvtps2pd\t{$src, $dst|$dst, $src}",
1643 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1644 PS, Sched<[WriteCvtPS2PD.Folded]>;
1647 // Convert Packed DW Integers to Packed Double FP
// Signed dword -> double conversions. The 128-bit memory forms load only
// 64 bits (two i32s) via scalar_to_vector of a loadi64.
// NOTE(review): gaps in the inner numbering (1652, 1654, 1660, ...) show some
// `[(set ...` / bitconvert wrapper lines were dropped from this excerpt.
1648 let Predicates = [HasAVX, NoVLX] in {
1649 let hasSideEffects = 0, mayLoad = 1 in
1650 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1651 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1653 (v2f64 (X86any_VSintToFP
1655 (v2i64 (scalar_to_vector
1656 (loadi64 addr:$src)))))))]>,
1657 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1658 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1659 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1661 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1662 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1663 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1664 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1666 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1667 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1669 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1670 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1672 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1673 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
// Legacy SSE2 forms.
1676 let hasSideEffects = 0, mayLoad = 1 in
1677 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1678 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1680 (v2f64 (X86any_VSintToFP
1682 (v2i64 (scalar_to_vector
1683 (loadi64 addr:$src)))))))]>,
1684 Sched<[WriteCvtI2PDLd]>;
1685 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1686 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1688 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1689 Sched<[WriteCvtI2PD]>;
// Fold a zero-extending 64-bit vector load directly into the conversion.
1691 // AVX register conversion intrinsics
1692 let Predicates = [HasAVX, NoVLX] in {
1693 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1694 (VCVTDQ2PDrm addr:$src)>;
1695 } // Predicates = [HasAVX, NoVLX]
1697 // SSE2 register conversion intrinsics
1698 let Predicates = [UseSSE2] in {
1699 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1700 (CVTDQ2PDrm addr:$src)>;
1701 } // Predicates = [UseSSE2]
1703 // Convert packed double to packed single
1704 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1705 // register, but the same isn't true when using memory operands instead.
1706 // Provide other assembly rr and rm forms to address this explicitly.
// Narrowing conversions (X86any_vfpround); both 128- and 256-bit sources
// produce an xmm result, hence the explicit {x}/{y} memory-form suffixes.
1707 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1709 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1710 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1711 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1712 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1713 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1714 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1715 [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
1716 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1718 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1719 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1720 [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
1721 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1722 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1723 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1724 [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
1725 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1726 } // Predicates = [HasAVX, NoVLX]
// AT&T-syntax aliases with explicit width suffixes for the register forms.
1728 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1729 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1730 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1731 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
// Legacy SSE2 forms.
1733 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1734 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1735 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1736 Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1737 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1738 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1739 [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
1740 Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1742 //===----------------------------------------------------------------------===//
1743 // SSE 1 & 2 - Compare Instructions
1744 //===----------------------------------------------------------------------===//
1746 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Emits the rr/rm pair for CMPSS/CMPSD-style scalar compares (opcode 0xC2)
// with an immediate condition code ($cc). The rr form is commutable; both
// read MXCSR and may raise FP exceptions.
1747 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1748 SDNode OpNode, ValueType VT,
1749 PatFrag ld_frag, string asm,
1750 X86FoldableSchedWrite sched> {
1751 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1752 let isCommutable = 1 in
1753 def rr : SIi8<0xC2, MRMSrcReg,
1754 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1755 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
1757 def rm : SIi8<0xC2, MRMSrcMem,
1758 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1759 [(set RC:$dst, (OpNode (VT RC:$src1),
1760 (ld_frag addr:$src2), timm:$cc))]>,
1761 Sched<[sched.Folded, sched.ReadAfterFold]>;
// VEX (3-operand) and legacy two-operand ($src1 = $dst) instantiations.
1765 let isCodeGenOnly = 1 in {
1766 let ExeDomain = SSEPackedSingle in
1767 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1768 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1769 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1770 let ExeDomain = SSEPackedDouble in
1771 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1772 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1773 SchedWriteFCmpSizes.PD.Scl>,
1774 XD, VEX_4V, VEX_LIG, VEX_WIG;
1776 let Constraints = "$src1 = $dst" in {
1777 let ExeDomain = SSEPackedSingle in
1778 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1779 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1780 SchedWriteFCmpSizes.PS.Scl>, XS;
1781 let ExeDomain = SSEPackedDouble in
1782 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1783 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1784 SchedWriteFCmpSizes.PD.Scl>, XD;
// Intrinsic (whole-XMM) variants of the scalar compares: operate on VR128
// and match the clang-level cmp.ss/cmp.sd intrinsics rather than FR32/FR64.
1788 multiclass sse12_cmp_scalar_int<Operand memop,
1789 Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1790 ComplexPattern mem_cpat> {
1791 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1792 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1793 (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
1794 [(set VR128:$dst, (Int VR128:$src1,
1795 VR128:$src, timm:$cc))]>,
1798 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1799 (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
1800 [(set VR128:$dst, (Int VR128:$src1,
1801 mem_cpat:$src, timm:$cc))]>,
1802 Sched<[sched.Folded, sched.ReadAfterFold]>;
1806 // Aliases to match intrinsics which expect XMM operand(s).
1807 let ExeDomain = SSEPackedSingle in
1808 defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1809 "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1810 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1811 XS, VEX_4V, VEX_LIG, VEX_WIG;
1812 let ExeDomain = SSEPackedDouble in
1813 defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1814 "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1815 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1816 XD, VEX_4V, VEX_LIG, VEX_WIG;
1817 let Constraints = "$src1 = $dst" in {
1818 let ExeDomain = SSEPackedSingle in
1819 defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1820 "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
1821 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1822 let ExeDomain = SSEPackedDouble in
1823 defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1824 "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
1825 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1829 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// (U)COMISS/(U)COMISD: compare two scalars and write EFLAGS (no data result).
1830 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1831 ValueType vt, X86MemOperand x86memop,
1832 PatFrag ld_frag, string OpcodeStr, Domain d,
1833 X86FoldableSchedWrite sched = WriteFCom> {
1834 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1836 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1837 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1838 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1841 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1842 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1843 [(set EFLAGS, (OpNode (vt RC:$src1),
1844 (ld_frag addr:$src2)))]>,
1845 Sched<[sched.Folded, sched.ReadAfterFold]>;
1849 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
// Same as above but on whole VR128 operands, matching the comi/ucomi
// intrinsic nodes (X86comi / X86ucomi).
1850 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1851 ValueType vt, Operand memop,
1852 ComplexPattern mem_cpat, string OpcodeStr,
1854 X86FoldableSchedWrite sched = WriteFCom> {
1855 let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
1856 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1857 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1858 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1861 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1862 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1863 [(set EFLAGS, (OpNode (vt RC:$src1),
1865 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations: 0x2E = unordered (UCOMIS*), 0x2F = ordered (COMIS*);
// VEX and legacy encodings, plus isCodeGenOnly intrinsic variants.
1869 let Defs = [EFLAGS] in {
1870 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1871 "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1872 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1873 "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1874 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1875 "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1876 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1877 "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1879 let isCodeGenOnly = 1 in {
1880 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1881 sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1882 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1883 sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1885 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1886 sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1887 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1888 sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1890 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1891 "ucomiss", SSEPackedSingle>, PS;
1892 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1893 "ucomisd", SSEPackedDouble>, PD;
1894 defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1895 "comiss", SSEPackedSingle>, PS;
1896 defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1897 "comisd", SSEPackedDouble>, PD;
1899 let isCodeGenOnly = 1 in {
1900 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1901 sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1902 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1903 sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1905 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1906 sse_load_f32, "comiss", SSEPackedSingle>, PS;
1907 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1908 sse_load_f64, "comisd", SSEPackedDouble>, PD;
1910 } // Defs = [EFLAGS]
1912 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
// CMPPS/CMPPD family (opcode 0xC2, packed): rri/rmi pair with an immediate
// condition code; rri is commutable.
1913 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1914 ValueType VT, string asm,
1915 X86FoldableSchedWrite sched,
1916 Domain d, PatFrag ld_frag> {
1917 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1918 let isCommutable = 1 in
1919 def rri : PIi8<0xC2, MRMSrcReg,
1920 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1921 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1923 def rmi : PIi8<0xC2, MRMSrcMem,
1924 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1926 (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1927 Sched<[sched.Folded, sched.ReadAfterFold]>;
// VEX 128/256-bit and legacy two-operand instantiations.
1931 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1932 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1933 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1934 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1935 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1936 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1937 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1938 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1939 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1940 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1941 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1942 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
1943 let Constraints = "$src1 = $dst" in {
1944 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1945 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1946 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1947 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1948 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1949 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
// Condition codes whose low 3 bits are EQ(0), UNORD(3), NEQ(4), ORD(7) are
// symmetric in their operands, so the compare may be commuted.
1952 def CommutableCMPCC : PatLeaf<(timm), [{
1953 uint64_t Imm = N->getZExtValue() & 0x7;
1954 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
// Commute commutable compares so a load in the first operand can be folded
// into the rmi/rm form (memory operand must be second).
1957 // Patterns to select compares with loads in first operand.
1958 let Predicates = [HasAVX] in {
1959 def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1960 CommutableCMPCC:$cc)),
1961 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1963 def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
1964 CommutableCMPCC:$cc)),
1965 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1967 def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
1968 CommutableCMPCC:$cc)),
1969 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1971 def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
1972 CommutableCMPCC:$cc)),
1973 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1975 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1976 CommutableCMPCC:$cc)),
1977 (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1979 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1980 CommutableCMPCC:$cc)),
1981 (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1984 let Predicates = [UseSSE2] in {
1985 def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
1986 CommutableCMPCC:$cc)),
1987 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1989 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1990 CommutableCMPCC:$cc)),
1991 (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1994 let Predicates = [UseSSE1] in {
1995 def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
1996 CommutableCMPCC:$cc)),
1997 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1999 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2000 CommutableCMPCC:$cc)),
2001 (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2004 //===----------------------------------------------------------------------===//
2005 // SSE 1 & 2 - Shuffle Instructions
2006 //===----------------------------------------------------------------------===//
2008 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
// SHUFPS/SHUFPD (opcode 0xC6): rri/rmi pair with an 8-bit shuffle immediate,
// lowered from the X86Shufp node.
2009 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2010 ValueType vt, string asm, PatFrag mem_frag,
2011 X86FoldableSchedWrite sched, Domain d,
2012 bit IsCommutable = 0> {
2013 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2014 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2015 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2016 (i8 timm:$src3))))], d>,
2017 Sched<[sched.Folded, sched.ReadAfterFold]>;
2018 let isCommutable = IsCommutable in
2019 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2020 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2021 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2022 (i8 timm:$src3))))], d>,
// VEX 128/256-bit and legacy two-operand instantiations. Note only SHUFPD
// (2-element) passes IsCommutable = 1.
2026 let Predicates = [HasAVX, NoVLX] in {
2027 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2028 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2029 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2030 PS, VEX_4V, VEX_WIG;
2031 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2032 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2033 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2034 PS, VEX_4V, VEX_L, VEX_WIG;
2035 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2036 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2037 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2038 PD, VEX_4V, VEX_WIG;
2039 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2040 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2041 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2042 PD, VEX_4V, VEX_L, VEX_WIG;
2044 let Constraints = "$src1 = $dst" in {
2045 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2046 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2047 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2048 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2049 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2050 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2053 //===----------------------------------------------------------------------===//
2054 // SSE 1 & 2 - Unpack FP Instructions
2055 //===----------------------------------------------------------------------===//
2057 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
// UNPCK[LH]P[SD] (opcodes 0x14/0x15): rr/rm pair lowered from X86Unpckl /
// X86Unpckh.
// NOTE(review): numbering gaps (2066, 2071) show the `[(set RC:$dst,` pattern
// openers were dropped from this excerpt — verify against upstream.
2058 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2059 PatFrag mem_frag, RegisterClass RC,
2060 X86MemOperand x86memop, string asm,
2061 X86FoldableSchedWrite sched, Domain d,
2062 bit IsCommutable = 0> {
2063 let isCommutable = IsCommutable in
2064 def rr : PI<opc, MRMSrcReg,
2065 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2067 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2069 def rm : PI<opc, MRMSrcMem,
2070 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2072 (vt (OpNode RC:$src1,
2073 (mem_frag addr:$src2))))], d>,
2074 Sched<[sched.Folded, sched.ReadAfterFold]>;
// VEX 128/256-bit instantiations (0x15 = high unpack, 0x14 = low unpack).
2077 let Predicates = [HasAVX, NoVLX] in {
2078 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2079 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2080 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2081 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2082 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2083 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2084 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2085 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2086 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2087 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2088 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2089 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2091 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2092 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2093 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2094 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2095 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2096 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2097 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2098 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2099 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2100 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2101 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2102 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2103 }// Predicates = [HasAVX, NoVLX]
// Legacy two-operand instantiations (aligned memop required).
2105 let Constraints = "$src1 = $dst" in {
2106 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2107 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2108 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2109 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2110 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2111 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2112 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2113 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2114 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2115 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2116 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2117 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2118 } // Constraints = "$src1 = $dst"
// AVX1 has no 256-bit integer unpacks; reuse the FP forms for v8i32/v4i64.
2120 let Predicates = [HasAVX1Only] in {
2121 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2122 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2123 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2124 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2125 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2126 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2127 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2128 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2130 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2131 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2132 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2133 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2134 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2135 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2136 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2137 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2140 let Predicates = [UseSSE2] in {
2141 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2142 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2143 (v2f64 (simple_load addr:$src2)))),
2144 (MOVHPDrm VR128:$src1, addr:$src2)>;
2147 //===----------------------------------------------------------------------===//
2148 // SSE 1 & 2 - Extract Floating-Point Sign mask
2149 //===----------------------------------------------------------------------===//
2151 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
// MOVMSKPS/MOVMSKPD (opcode 0x50): extract per-element sign bits into a GPR
// via the X86movmsk node. (The header comment above is inherited and slightly
// misleading — this multiclass emits sign-mask extraction, not unpacks.)
2152 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2153 string asm, Domain d> {
2154 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2155 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2156 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2157 Sched<[WriteFMOVMSK]>;
// VEX 128/256-bit instantiations plus integer-VT reuse patterns.
2160 let Predicates = [HasAVX] in {
2161 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2162 SSEPackedSingle>, PS, VEX, VEX_WIG;
2163 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2164 SSEPackedDouble>, PD, VEX, VEX_WIG;
2165 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2166 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2167 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2168 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2170 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2171 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2172 (VMOVMSKPSrr VR128:$src)>;
2173 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2174 (VMOVMSKPDrr VR128:$src)>;
2175 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2176 (VMOVMSKPSYrr VR256:$src)>;
2177 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2178 (VMOVMSKPDYrr VR256:$src)>;
// Legacy SSE forms and the corresponding integer-VT patterns.
2181 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2182 SSEPackedSingle>, PS;
2183 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2184 SSEPackedDouble>, PD;
2186 let Predicates = [UseSSE2] in {
2187 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2188 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2189 (MOVMSKPSrr VR128:$src)>;
2190 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2191 (MOVMSKPDrr VR128:$src)>;
2194 //===---------------------------------------------------------------------===//
2195 // SSE2 - Packed Integer Logical Instructions
2196 //===---------------------------------------------------------------------===//
// Packed-integer logical ops (PAND/POR/PXOR/PANDN and their VEX forms).
2198 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2200 /// PDI_binop_rm - Simple SSE2 binary operator.
// Emits the rr/rm pair; Is2Addr selects the legacy two-operand asm string
// vs. the VEX three-operand one.
2201 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2202 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2203 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2204 bit IsCommutable, bit Is2Addr> {
2205 let isCommutable = IsCommutable in
2206 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2207 (ins RC:$src1, RC:$src2),
2209 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2210 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2211 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2213 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2214 (ins RC:$src1, x86memop:$src2),
2216 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2217 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2218 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2219 Sched<[sched.Folded, sched.ReadAfterFold]>;
2221 } // ExeDomain = SSEPackedInt
// Wrapper that instantiates the VEX 128-bit, legacy 128-bit, and AVX2
// 256-bit forms of one packed-integer binop under one name.
2223 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2224 ValueType OpVT128, ValueType OpVT256,
2225 X86SchedWriteWidths sched, bit IsCommutable,
2227 let Predicates = [HasAVX, prd] in
2228 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2229 VR128, load, i128mem, sched.XMM,
2230 IsCommutable, 0>, VEX_4V, VEX_WIG;
2232 let Constraints = "$src1 = $dst" in
2233 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2234 memop, i128mem, sched.XMM, IsCommutable, 1>;
2236 let Predicates = [HasAVX2, prd] in
2237 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2238 OpVT256, VR256, load, i256mem, sched.YMM,
2239 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2242 // These are ordered here for pattern ordering requirements with the fp versions
// All defined on v2i64/v4i64; PANDN (X86andnp) is the only non-commutable one.
2244 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2245 SchedWriteVecLogic, 1, NoVLX>;
2246 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2247 SchedWriteVecLogic, 1, NoVLX>;
2248 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2249 SchedWriteVecLogic, 1, NoVLX>;
2250 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2251 SchedWriteVecLogic, 0, NoVLX>;
2253 //===----------------------------------------------------------------------===//
2254 // SSE 1 & 2 - Logical Instructions
2255 //===----------------------------------------------------------------------===//
2257 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2259 /// There are no patterns here because isel prefers integer versions for SSE2
2260 /// and later. There are SSE1 v4f32 patterns later.
///
/// Emits six instruction families for one opcode: AVX 256-bit PS/PD, AVX
/// 128-bit PS/PD (all non-destructive, empty pattern lists), and legacy SSE
/// 128-bit PS/PD (2-address, $src1 tied to $dst).
/// NOTE(review): the trailing arguments of the legacy PS/PD defms and the
/// closing braces are elided in this view (original lines 2284-2290).
2261 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2262                                    SDNode OpNode, X86SchedWriteWidths sched> {
2263   let Predicates = [HasAVX, NoVLX] in {
2264   defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2265                 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2266                 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2268   defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2269                 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2270                 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2272   defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2273                 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2274                 [], [], 0>, PS, VEX_4V, VEX_WIG;
2276   defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2277                 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2278                 [], [], 0>, PD, VEX_4V, VEX_WIG;
// Legacy SSE forms: destructive encoding, hence the tied-operand constraint.
2281   let Constraints = "$src1 = $dst" in {
2282     defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2283                 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2286     defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2287                 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
// FP bitwise-logic opcodes (ANDPS/ANDPD etc.). ANDN is (~src1 & src2) and
// therefore explicitly marked non-commutable.
2292 defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2293 defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2294 defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2295 let isCommutable = 0 in
2296   defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// With AVX2, select the 256-bit integer logic instructions (VPAND/VPOR/
// VPXOR/VPANDN YMM) for the remaining integer element types; the v4i64
// patterns come from PDI_binop_all above. First reg-reg, then reg-mem forms
// folding a load of the right-hand operand.
2298 let Predicates = [HasAVX2, NoVLX] in {
2299   def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2300             (VPANDYrr VR256:$src1, VR256:$src2)>;
2301   def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2302             (VPANDYrr VR256:$src1, VR256:$src2)>;
2303   def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2304             (VPANDYrr VR256:$src1, VR256:$src2)>;
2306   def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2307             (VPORYrr VR256:$src1, VR256:$src2)>;
2308   def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2309             (VPORYrr VR256:$src1, VR256:$src2)>;
2310   def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2311             (VPORYrr VR256:$src1, VR256:$src2)>;
2313   def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2314             (VPXORYrr VR256:$src1, VR256:$src2)>;
2315   def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2316             (VPXORYrr VR256:$src1, VR256:$src2)>;
2317   def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2318             (VPXORYrr VR256:$src1, VR256:$src2)>;
2320   def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2321             (VPANDNYrr VR256:$src1, VR256:$src2)>;
2322   def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2323             (VPANDNYrr VR256:$src1, VR256:$src2)>;
2324   def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2325             (VPANDNYrr VR256:$src1, VR256:$src2)>;
// Load-folding variants: only $src2 may come from memory (x86 encoding).
2327   def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2328             (VPANDYrm VR256:$src1, addr:$src2)>;
2329   def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2330             (VPANDYrm VR256:$src1, addr:$src2)>;
2331   def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2332             (VPANDYrm VR256:$src1, addr:$src2)>;
2334   def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2335             (VPORYrm VR256:$src1, addr:$src2)>;
2336   def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2337             (VPORYrm VR256:$src1, addr:$src2)>;
2338   def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2339             (VPORYrm VR256:$src1, addr:$src2)>;
2341   def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2342             (VPXORYrm VR256:$src1, addr:$src2)>;
2343   def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2344             (VPXORYrm VR256:$src1, addr:$src2)>;
2345   def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2346             (VPXORYrm VR256:$src1, addr:$src2)>;
2348   def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2349             (VPANDNYrm VR256:$src1, addr:$src2)>;
2350   def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2351             (VPANDNYrm VR256:$src1, addr:$src2)>;
2352   def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2353             (VPANDNYrm VR256:$src1, addr:$src2)>;
2356 // If only AVX1 is supported, we need to handle integer operations with
2357 // floating point instructions since the integer versions aren't available.
// AVX1 has no 256-bit integer ALU, so map 256-bit integer and/or/xor/andnp
// (all element widths, including v4i64) onto the single-precision FP logic
// instructions VANDPS/VORPS/VXORPS/VANDNPS — bitwise ops are type-agnostic.
2358 let Predicates = [HasAVX1Only] in {
2359   def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2360             (VANDPSYrr VR256:$src1, VR256:$src2)>;
2361   def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2362             (VANDPSYrr VR256:$src1, VR256:$src2)>;
2363   def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2364             (VANDPSYrr VR256:$src1, VR256:$src2)>;
2365   def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2366             (VANDPSYrr VR256:$src1, VR256:$src2)>;
2368   def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2369             (VORPSYrr VR256:$src1, VR256:$src2)>;
2370   def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2371             (VORPSYrr VR256:$src1, VR256:$src2)>;
2372   def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2373             (VORPSYrr VR256:$src1, VR256:$src2)>;
2374   def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2375             (VORPSYrr VR256:$src1, VR256:$src2)>;
2377   def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2378             (VXORPSYrr VR256:$src1, VR256:$src2)>;
2379   def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2380             (VXORPSYrr VR256:$src1, VR256:$src2)>;
2381   def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2382             (VXORPSYrr VR256:$src1, VR256:$src2)>;
2383   def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2384             (VXORPSYrr VR256:$src1, VR256:$src2)>;
2386   def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2387             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2388   def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2389             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2390   def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2391             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2392   def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2393             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
// Load-folding variants of the same mapping.
2395   def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2396             (VANDPSYrm VR256:$src1, addr:$src2)>;
2397   def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2398             (VANDPSYrm VR256:$src1, addr:$src2)>;
2399   def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2400             (VANDPSYrm VR256:$src1, addr:$src2)>;
2401   def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2402             (VANDPSYrm VR256:$src1, addr:$src2)>;
2404   def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2405             (VORPSYrm VR256:$src1, addr:$src2)>;
2406   def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2407             (VORPSYrm VR256:$src1, addr:$src2)>;
2408   def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2409             (VORPSYrm VR256:$src1, addr:$src2)>;
2410   def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2411             (VORPSYrm VR256:$src1, addr:$src2)>;
2413   def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2414             (VXORPSYrm VR256:$src1, addr:$src2)>;
2415   def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2416             (VXORPSYrm VR256:$src1, addr:$src2)>;
2417   def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2418             (VXORPSYrm VR256:$src1, addr:$src2)>;
2419   def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2420             (VXORPSYrm VR256:$src1, addr:$src2)>;
2422   def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2423             (VANDNPSYrm VR256:$src1, addr:$src2)>;
2424   def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2425             (VANDNPSYrm VR256:$src1, addr:$src2)>;
2426   def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2427             (VANDNPSYrm VR256:$src1, addr:$src2)>;
2428   def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2429             (VANDNPSYrm VR256:$src1, addr:$src2)>;
// 128-bit VEX-encoded integer logic for the element types not covered by
// PDI_binop_all (v16i8/v8i16/v4i32); v2i64 is handled there. Reg-reg forms
// first, then load-folding forms for the memory right-hand operand.
2432 let Predicates = [HasAVX, NoVLX] in {
2433   def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2434             (VPANDrr VR128:$src1, VR128:$src2)>;
2435   def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2436             (VPANDrr VR128:$src1, VR128:$src2)>;
2437   def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2438             (VPANDrr VR128:$src1, VR128:$src2)>;
2440   def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2441             (VPORrr VR128:$src1, VR128:$src2)>;
2442   def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2443             (VPORrr VR128:$src1, VR128:$src2)>;
2444   def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2445             (VPORrr VR128:$src1, VR128:$src2)>;
2447   def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2448             (VPXORrr VR128:$src1, VR128:$src2)>;
2449   def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2450             (VPXORrr VR128:$src1, VR128:$src2)>;
2451   def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2452             (VPXORrr VR128:$src1, VR128:$src2)>;
2454   def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2455             (VPANDNrr VR128:$src1, VR128:$src2)>;
2456   def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2457             (VPANDNrr VR128:$src1, VR128:$src2)>;
2458   def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2459             (VPANDNrr VR128:$src1, VR128:$src2)>;
// AVX memory forms use unaligned `load` fragments (loadv*).
2461   def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2462             (VPANDrm VR128:$src1, addr:$src2)>;
2463   def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2464             (VPANDrm VR128:$src1, addr:$src2)>;
2465   def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2466             (VPANDrm VR128:$src1, addr:$src2)>;
2468   def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2469             (VPORrm VR128:$src1, addr:$src2)>;
2470   def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2471             (VPORrm VR128:$src1, addr:$src2)>;
2472   def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2473             (VPORrm VR128:$src1, addr:$src2)>;
2475   def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2476             (VPXORrm VR128:$src1, addr:$src2)>;
2477   def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2478             (VPXORrm VR128:$src1, addr:$src2)>;
2479   def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2480             (VPXORrm VR128:$src1, addr:$src2)>;
2482   def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2483             (VPANDNrm VR128:$src1, addr:$src2)>;
2484   def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2485             (VPANDNrm VR128:$src1, addr:$src2)>;
2486   def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2487             (VPANDNrm VR128:$src1, addr:$src2)>;
// Legacy-SSE2 counterparts of the patterns above: same mapping onto the
// non-VEX PAND/POR/PXOR/PANDN, but memory forms use `memop*` fragments
// (alignment-requiring loads, as legacy SSE memory operands must be aligned).
2490 let Predicates = [UseSSE2] in {
2491   def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2492             (PANDrr VR128:$src1, VR128:$src2)>;
2493   def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2494             (PANDrr VR128:$src1, VR128:$src2)>;
2495   def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2496             (PANDrr VR128:$src1, VR128:$src2)>;
2498   def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2499             (PORrr VR128:$src1, VR128:$src2)>;
2500   def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2501             (PORrr VR128:$src1, VR128:$src2)>;
2502   def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2503             (PORrr VR128:$src1, VR128:$src2)>;
2505   def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2506             (PXORrr VR128:$src1, VR128:$src2)>;
2507   def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2508             (PXORrr VR128:$src1, VR128:$src2)>;
2509   def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2510             (PXORrr VR128:$src1, VR128:$src2)>;
2512   def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2513             (PANDNrr VR128:$src1, VR128:$src2)>;
2514   def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2515             (PANDNrr VR128:$src1, VR128:$src2)>;
2516   def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2517             (PANDNrr VR128:$src1, VR128:$src2)>;
2519   def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2520             (PANDrm VR128:$src1, addr:$src2)>;
2521   def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2522             (PANDrm VR128:$src1, addr:$src2)>;
2523   def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2524             (PANDrm VR128:$src1, addr:$src2)>;
2526   def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2527             (PORrm VR128:$src1, addr:$src2)>;
2528   def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2529             (PORrm VR128:$src1, addr:$src2)>;
2530   def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2531             (PORrm VR128:$src1, addr:$src2)>;
2533   def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2534             (PXORrm VR128:$src1, addr:$src2)>;
2535   def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2536             (PXORrm VR128:$src1, addr:$src2)>;
2537   def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2538             (PXORrm VR128:$src1, addr:$src2)>;
2540   def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2541             (PANDNrm VR128:$src1, addr:$src2)>;
2542   def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2543             (PANDNrm VR128:$src1, addr:$src2)>;
2544   def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2545             (PANDNrm VR128:$src1, addr:$src2)>;
2548 // Patterns for packed operations when we don't have integer type available.
// v4f32 bitwise ops via the X86f* FP-logic nodes, mapped onto the SSE1
// ANDPS/ORPS/XORPS/ANDNPS instructions (reg-reg, then memory-folded forms).
// NOTE(review): the enclosing `let Predicates = ...` line (original
// 2546/2547) is elided in this view — presumably UseSSE1; confirm in the
// full file.
2549 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2550           (ANDPSrr VR128:$src1, VR128:$src2)>;
2551 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2552           (ORPSrr VR128:$src1, VR128:$src2)>;
2553 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2554           (XORPSrr VR128:$src1, VR128:$src2)>;
2555 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2556           (ANDNPSrr VR128:$src1, VR128:$src2)>;
2558 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2559           (ANDPSrm VR128:$src1, addr:$src2)>;
2560 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2561           (ORPSrm VR128:$src1, addr:$src2)>;
2562 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2563           (XORPSrm VR128:$src1, addr:$src2)>;
2564 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2565           (ANDNPSrm VR128:$src1, addr:$src2)>;
2567 //===----------------------------------------------------------------------===//
2568 // SSE 1 & 2 - Arithmetic Instructions
2569 //===----------------------------------------------------------------------===//
2571 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2574 /// In addition, we also have a special variant of the scalar form here to
2575 /// represent the associated intrinsic operation. This form is unlike the
2576 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2577 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2579 /// These three forms can each be reg+reg or reg+mem.
2582 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// Packed-form instantiation: AVX PS/PD XMM, AVX PS/PD YMM, and legacy SSE
/// PS/PD (tied-operand). All variants read MXCSR and may raise FP exceptions.
/// NOTE(review): trailing args of the legacy defms and closing braces are
/// elided in this view (original lines 2606, 2609-2612).
2584 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2585                                   SDNode OpNode, X86SchedWriteSizes sched> {
2586 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2587   let Predicates = [HasAVX, NoVLX] in {
2588   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2589                                VR128, v4f32, f128mem, loadv4f32,
2590                                SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2591   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2592                                VR128, v2f64, f128mem, loadv2f64,
2593                                SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2595   defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2596                         OpNode, VR256, v8f32, f256mem, loadv8f32,
2597                         SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2598   defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2599                         OpNode, VR256, v4f64, f256mem, loadv4f64,
2600                         SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE: destructive encoding, memop (aligned) loads.
2603   let Constraints = "$src1 = $dst" in {
2604     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2605                               v4f32, f128mem, memopv4f32, SSEPackedSingle,
2607     defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2608                               v2f64, f128mem, memopv2f64, SSEPackedDouble,
// Scalar-form instantiation: AVX SS/SD (3-operand, VEX) plus legacy SS/SD
// (tied-operand), operating on FR32/FR64 scalar registers. All read MXCSR
// and may raise FP exceptions.
// NOTE(review): trailing args of the legacy defms and closing braces are
// elided in this view (original lines 2627, 2630-2633).
2614 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2615                                   X86SchedWriteSizes sched> {
2616 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2617   defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2618                          OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2619                          XS, VEX_4V, VEX_LIG, VEX_WIG;
2620   defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2621                          OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2622                          XD, VEX_4V, VEX_LIG, VEX_WIG;
2624   let Constraints = "$src1 = $dst" in {
2625     defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2626                               OpNode, FR32, f32mem, SSEPackedSingle,
2628     defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2629                               OpNode, FR64, f64mem, SSEPackedDouble,
// Intrinsic scalar-form instantiation: like basic_sse12_fp_binop_s but the
// operands are whole VR128 vectors (v4f32/v2f64) with the upper elements
// passed through, matching the *_ss/*_sd intrinsic semantics. All read
// MXCSR and may raise FP exceptions.
2635 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2636                                       SDPatternOperator OpNode,
2637                                       X86SchedWriteSizes sched> {
2638 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2639   defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2640                    !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2641                    SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2642   defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2643                    !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2644                    SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2646   let Constraints = "$src1 = $dst" in {
2647     defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2648                        !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2649                        SSEPackedSingle, sched.PS.Scl>, XS;
2650     defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2651                        !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2652                        SSEPackedDouble, sched.PD.Scl>, XD;
2657 // Binary Arithmetic instructions
// ADD/MUL use null_frag for the intrinsic form (the plain patterns suffice);
// SUB/DIV/MAX/MIN are non-commutable. MAX/MIN intrinsic forms use the
// dedicated X86fmaxs/X86fmins scalar nodes.
2658 defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2659            basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2660            basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2661 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2662            basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2663            basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2664 let isCommutable = 0 in {
2665   defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2666              basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2667              basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2668   defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2669              basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2670              basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2671   defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2672              basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2673              basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2674   defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2675              basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2676              basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
// Commutable max/min variants (X86fmaxc/X86fminc) — codegen-only: they share
// the MAX/MIN opcodes (0x5F/0x5D) and exist solely so isel can commute when
// NaN/signed-zero ordering is known not to matter.
2679 let isCodeGenOnly = 1 in {
2680   defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2681             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2682   defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2683             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2686 // Patterns used to select SSE scalar fp arithmetic instructions from
2689 // (1) a scalar fp operation followed by a blend
2691 // The effect is that the backend no longer emits unnecessary vector
2692 // insert instructions immediately after SSE scalar fp instructions
2693 // like addss or mulss.
2695 // For example, given the following code:
2696 //   __m128 foo(__m128 A, __m128 B) {
2701 // Previously we generated:
2702 //   addss %xmm0, %xmm1
2703 //   movss %xmm1, %xmm0
2706 //   addss %xmm1, %xmm0
2708 // (2) a vector packed single/double fp operation followed by a vector insert
2710 // The effect is that the backend converts the packed fp instruction
2711 // followed by a vector insert into a single SSE scalar fp instruction.
2713 // For example, given the following code:
2714 //   __m128 foo(__m128 A, __m128 B) {
2715 //     __m128 C = A + B;
2716 //     return (__m128) {c[0], a[1], a[2], a[3]};
2719 // Previously we generated:
2720 //   addps %xmm0, %xmm1
2721 //   movss %xmm1, %xmm0
2724 //   addss %xmm1, %xmm0
2726 // TODO: Some canonicalization in lowering would simplify the number of
2727 // patterns we have to try to match.
// Matches (Move dst, (scalar_to_vector (Op (extractelt dst, 0), src))) onto
// the *_Int scalar instruction, for both the SSE (BasePredicate) and AVX
// (UseAVX, "V"-prefixed) instruction sets, each with a register and a
// load-folding variant. NOTE(review): the second operand of the reg-reg
// patterns (original lines 2737/2753) is elided in this view.
2728 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2729                                 ValueType VT, ValueType EltTy,
2730                                 RegisterClass RC, PatFrag ld_frag,
2731                                 Predicate BasePredicate> {
2732   let Predicates = [BasePredicate] in {
2733     // extracted scalar math op with insert via movss/movsd
2734     def : Pat<(VT (Move (VT VR128:$dst),
2735                         (VT (scalar_to_vector
2736                              (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2738               (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2739                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2740     def : Pat<(VT (Move (VT VR128:$dst),
2741                         (VT (scalar_to_vector
2742                              (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2743                                  (ld_frag addr:$src)))))),
2744               (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2747   // Repeat for AVX versions of the instructions.
2748   let Predicates = [UseAVX] in {
2749     // extracted scalar math op with insert via movss/movsd
2750     def : Pat<(VT (Move (VT VR128:$dst),
2751                         (VT (scalar_to_vector
2752                              (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2754               (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2755                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2756     def : Pat<(VT (Move (VT VR128:$dst),
2757                         (VT (scalar_to_vector
2758                              (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2759                                  (ld_frag addr:$src)))))),
2760               (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
// Instantiate the scalar-math patterns for the four basic FP ops, in both
// f32 (SSE1) and f64 (SSE2) flavors.
2764 defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2765 defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2766 defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2767 defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2769 defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2770 defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2771 defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2772 defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2775 /// In addition, we also have a special variant of the scalar form here to
2776 /// represent the associated intrinsic operation. This form is unlike the
2777 /// plain scalar form, in that it takes an entire vector (instead of a
2778 /// scalar) and leaves the top elements undefined.
2780 /// And, we have a special variant form for a full-vector intrinsic form.
2782 /// sse_fp_unop_s - SSE1 unops in scalar form
2783 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2784 /// the HW instructions are 2 operand / destructive.
///
/// Emits codegen-only `r`/`m` forms with real patterns (the `m` form is
/// OptForSize-gated, see the partial-register-update note near the avx
/// variant) and pattern-less `r_Int`/`m_Int` forms on VR128 that model the
/// intrinsic's tied-destination semantics.
2785 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2786                           ValueType ScalarVT, X86MemOperand x86memop,
2787                           Operand intmemop, SDNode OpNode, Domain d,
2788                           X86FoldableSchedWrite sched, Predicate target> {
2789   let isCodeGenOnly = 1, hasSideEffects = 0 in {
2790   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2791               !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2792             [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
// Folding the load is only done at OptForSize (partial-register-update
// avoidance; see comment block at original lines 2876-2884).
2795   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2796               !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2797             [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2798             Sched<[sched.Folded]>,
2799             Requires<[target, OptForSize]>;
// Intrinsic forms: no patterns here (selected via sse_fp_unop_s_intr);
// destructive, so $src1 is tied to $dst.
2802   let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2803   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2804                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2807   def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2808                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2809                 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Selects the SSE scalar-unary intrinsic onto the pattern-less *_Int
// instructions emitted by sse_fp_unop_s. The register source is passed
// twice because the instruction is modeled with a tied pass-through operand.
2814 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2815                               ComplexPattern int_cpat, Intrinsic Intr,
2816                               Predicate target, string Suffix> {
2817   let Predicates = [target] in {
2818   // These are unary operations, but they are modeled as having 2 source operands
2819   // because the high elements of the destination are unchanged in SSE.
2820   def : Pat<(Intr VR128:$src),
2821             (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2823   // We don't want to fold scalar loads into these instructions unless
2824   // optimizing for size. This is because the folded instruction will have a
2825   // partial register update, while the unfolded sequence will not, e.g.
2827   // rcpss %xmm0, %xmm0
2828   // which has a clobber before the rcp, vs.
// Load-folding variant, OptForSize only; upper elements are left undefined
// via IMPLICIT_DEF.
2830   let Predicates = [target, OptForSize] in {
2831     def : Pat<(Intr int_cpat:$src2),
2832                (!cast<Instruction>(NAME#m_Int)
2833                       (vt (IMPLICIT_DEF)), addr:$src2)>;
// AVX counterpart of sse_fp_unop_s_intr. NOTE(review): the second operand of
// the reg-reg result (original line 2842) is elided in this view — it
// presumably repeats VR128:$src like the SSE variant; confirm in full file.
2837 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
2838                               Intrinsic Intr, Predicate target> {
2839   let Predicates = [target] in {
2840    def : Pat<(Intr VR128:$src),
2841              (!cast<Instruction>(NAME#r_Int) VR128:$src,
// Load-folding variant, OptForSize only (partial-register-update avoidance).
2844   let Predicates = [target, OptForSize] in {
2845     def : Pat<(Intr int_cpat:$src2),
2846               (!cast<Instruction>(NAME#m_Int)
2847                     (vt (IMPLICIT_DEF)), addr:$src2)>;
// AVX scalar unop: 3-operand VEX encodings (no tied constraint). All four
// instruction forms (r/m codegen-only, r_Int/m_Int) carry empty pattern
// lists; selection is done by the explicit Pat<>s at the bottom, which feed
// IMPLICIT_DEF as the pass-through first source.
2851 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2852                           ValueType ScalarVT, X86MemOperand x86memop,
2853                           Operand intmemop, SDNode OpNode, Domain d,
2854                           X86FoldableSchedWrite sched, Predicate target> {
2855   let isCodeGenOnly = 1, hasSideEffects = 0 in {
2856   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2857             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2858             [], d>, Sched<[sched]>;
2860   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2861             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2862             [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2864   let hasSideEffects = 0, ExeDomain = d in {
2865   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2866                 (ins VR128:$src1, VR128:$src2),
2867              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2868              []>, Sched<[sched]>;
2870   def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2871                 (ins VR128:$src1, intmemop:$src2),
2872              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2873              []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2876   // We don't want to fold scalar loads into these instructions unless
2877   // optimizing for size. This is because the folded instruction will have a
2878   // partial register update, while the unfolded sequence will not, e.g.
2879   // vmovss mem, %xmm0
2880   // vrcpss %xmm0, %xmm0, %xmm0
2881   // which has a clobber before the rcp, vs.
2882   // vrcpss mem, %xmm0, %xmm0
2883   // TODO: In theory, we could fold the load, and avoid the stall caused by
2884   // the partial register store, either in BreakFalseDeps or with smarter RA.
2885   let Predicates = [target] in {
2886    def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
2887                                 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2889   let Predicates = [target, OptForSize] in {
2890     def : Pat<(ScalarVT (OpNode (load addr:$src))),
2891               (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2896 /// sse1_fp_unop_p - SSE1 unops in packed form.
/// Emits AVX 128-bit (V*PSr/m), AVX 256-bit (V*PSYr/m), and legacy SSE
/// (PSr/m) packed-single variants, each with a register and a memory form.
/// AVX memory forms use unaligned loadv*; the legacy form uses memopv4f32.
/// NOTE(review): the `let` closer and scheduling suffix of PSr (original
/// lines 2920-2921, 2925, 2930) are elided in this view.
2897 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
2898                           X86SchedWriteWidths sched, list<Predicate> prds> {
2899 let Predicates = prds in {
2900   def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2901                        !strconcat("v", OpcodeStr,
2902                                   "ps\t{$src, $dst|$dst, $src}"),
2903                        [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2904                        VEX, Sched<[sched.XMM]>, VEX_WIG;
2905   def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2906                        !strconcat("v", OpcodeStr,
2907                                   "ps\t{$src, $dst|$dst, $src}"),
2908                        [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2909                        VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2910   def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2911                         !strconcat("v", OpcodeStr,
2912                                    "ps\t{$src, $dst|$dst, $src}"),
2913                         [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2914                         VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2915   def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2916                         !strconcat("v", OpcodeStr,
2917                                    "ps\t{$src, $dst|$dst, $src}"),
2918                         [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2919                         VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE1 forms (non-VEX).
2922   def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2923                 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2924                 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2926   def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2927                 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2928                 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2929                 Sched<[sched.XMM.Folded]>;
2932 /// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Packed-double mirror of sse1_fp_unop_p: AVX 128-bit, AVX 256-bit, and
/// legacy SSE2 variants; legacy memory form uses aligned memopv2f64.
/// NOTE(review): closers and the PDr scheduling suffix (original lines
/// 2956-2957, 2961, 2966) are elided in this view.
2933 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2934                           SDNode OpNode, X86SchedWriteWidths sched> {
2935 let Predicates = [HasAVX, NoVLX] in {
2936   def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2937                        !strconcat("v", OpcodeStr,
2938                                   "pd\t{$src, $dst|$dst, $src}"),
2939                        [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2940                        VEX, Sched<[sched.XMM]>, VEX_WIG;
2941   def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2942                        !strconcat("v", OpcodeStr,
2943                                   "pd\t{$src, $dst|$dst, $src}"),
2944                        [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2945                        VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2946   def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2947                         !strconcat("v", OpcodeStr,
2948                                    "pd\t{$src, $dst|$dst, $src}"),
2949                         [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2950                         VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2951   def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2952                         !strconcat("v", OpcodeStr,
2953                                    "pd\t{$src, $dst|$dst, $src}"),
2954                         [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2955                         VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 forms (non-VEX).
2958   def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2959                 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2960                 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2962   def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2963                 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2964                 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2965                 Sched<[sched.XMM.Folded]>;
// Wires the int_x86_sse_<op>_ss intrinsic to both the SSE (SS) and AVX
// (V#NAME#SS) *_Int instructions. NOTE(review): the trailing args of the
// SSE defm (original line 2972) are elided in this view.
2968 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
2969                                X86SchedWriteWidths sched, Predicate AVXTarget> {
2970   defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2971                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2973   defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2974                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
2976                       XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
// Scalar-single unop: legacy SSE form (UseSSE1) plus AVX "v"-prefixed
// 3-operand form gated on AVXTarget.
2979 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2980                           X86SchedWriteWidths sched, Predicate AVXTarget> {
2981   defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
2982                       ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
2983   defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
2984                       f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
2985                       XS, VEX_4V, VEX_LIG, VEX_WIG;
// Scalar-double mirror of sse1_fp_unop_s (SD/f64/sdmem, XD prefix).
2988 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2989                           X86SchedWriteWidths sched, Predicate AVXTarget> {
2990   defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
2991                       sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
2992   defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
2993                       f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2994                       XD, VEX_4V, VEX_LIG, VEX_WIG;
// SQRT (0x51): full scalar+packed coverage in both f32 and f64; SIMD_EXC
// marks it as raising FP exceptions. RSQRT/RCP (0x52/0x53) are f32-only
// approximation instructions (no f64 variants exist) and need refinement
// iterations for full precision.
2998 defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
2999              sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3000              sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3001              sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3003 // Reciprocal approximations. Note that these typically require refinement
3004 // in order to obtain suitable precision.
3005 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3006              sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3007              sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3008 defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3009              sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3010              sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3012 // There is no f64 version of the reciprocal approximation instructions.
// scalar_unary_math_patterns - Select "op on extracted element 0, then
// re-insert via Move" to the _Int form of the instruction, avoiding a
// round-trip through scalar registers. SSE and AVX variants.
// NOTE(review): closing braces elided in this excerpt.
3014 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3015 ValueType VT, Predicate BasePredicate> {
3016 let Predicates = [BasePredicate] in {
3017 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3018 (OpNode (extractelt VT:$src, 0))))),
3019 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3022 // Repeat for AVX versions of the instructions.
3023 let Predicates = [UseAVX] in {
3024 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3025 (OpNode (extractelt VT:$src, 0))))),
3026 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
// Instantiate for sqrt in both scalar element widths.
3030 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3031 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
// scalar_unary_math_intr_patterns - Same idea as above but matches the
// intrinsic node directly (Intr) instead of extract/op/insert.
// NOTE(review): closing braces and the trailing arguments of the two
// instantiations are elided in this excerpt.
3033 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3034 SDNode Move, ValueType VT,
3035 Predicate BasePredicate> {
3036 let Predicates = [BasePredicate] in {
3037 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3038 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3041 // Repeat for AVX versions of the instructions.
3042 let Predicates = [HasAVX] in {
3043 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3044 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3048 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3050 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3054 //===----------------------------------------------------------------------===//
3055 // SSE 1 & 2 - Non-temporal stores
3056 //===----------------------------------------------------------------------===//
// AddedComplexity = 400 makes isel prefer these over plain aligned stores
// when the nontemporal hint is present.
3058 let AddedComplexity = 400 in { // Prefer non-temporal versions
3059 let Predicates = [HasAVX, NoVLX] in {
// VEX-encoded 128-bit FP non-temporal stores (movntps/movntpd).
3060 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3061 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3062 (ins f128mem:$dst, VR128:$src),
3063 "movntps\t{$src, $dst|$dst, $src}",
3064 [(alignednontemporalstore (v4f32 VR128:$src),
3065 addr:$dst)]>, VEX, VEX_WIG;
3066 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3067 (ins f128mem:$dst, VR128:$src),
3068 "movntpd\t{$src, $dst|$dst, $src}",
3069 [(alignednontemporalstore (v2f64 VR128:$src),
3070 addr:$dst)]>, VEX, VEX_WIG;
// 256-bit YMM variants.
3073 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3074 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3075 (ins f256mem:$dst, VR256:$src),
3076 "movntps\t{$src, $dst|$dst, $src}",
3077 [(alignednontemporalstore (v8f32 VR256:$src),
3078 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3079 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3080 (ins f256mem:$dst, VR256:$src),
3081 "movntpd\t{$src, $dst|$dst, $src}",
3082 [(alignednontemporalstore (v4f64 VR256:$src),
3083 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
// Integer-domain non-temporal stores (movntdq), 128- and 256-bit.
3086 let ExeDomain = SSEPackedInt in {
3087 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3088 (ins i128mem:$dst, VR128:$src),
3089 "movntdq\t{$src, $dst|$dst, $src}",
3090 [(alignednontemporalstore (v2i64 VR128:$src),
3091 addr:$dst)]>, VEX, VEX_WIG,
3092 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3093 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3094 (ins i256mem:$dst, VR256:$src),
3095 "movntdq\t{$src, $dst|$dst, $src}",
3096 [(alignednontemporalstore (v4i64 VR256:$src),
3097 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3098 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
// Legacy SSE (non-VEX) encodings of the same stores.
3102 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3103 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3104 "movntps\t{$src, $dst|$dst, $src}",
3105 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3106 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3107 "movntpd\t{$src, $dst|$dst, $src}",
3108 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3111 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3112 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3113 "movntdq\t{$src, $dst|$dst, $src}",
3114 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
// Scalar GPR non-temporal stores; movnti has no VEX form.
3116 let SchedRW = [WriteStoreNT] in {
3117 // There is no AVX form for instructions below this point
3118 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3119 "movnti{l}\t{$src, $dst|$dst, $src}",
3120 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3121 PS, Requires<[HasSSE2]>;
3122 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3123 "movnti{q}\t{$src, $dst|$dst, $src}",
3124 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3125 PS, Requires<[HasSSE2]>;
3126 } // SchedRW = [WriteStoreNT]
// Map the remaining integer element types onto the v2i64/v4i64 encodings.
3128 let Predicates = [HasAVX, NoVLX] in {
3129 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3130 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3131 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3132 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3133 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3134 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3136 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3137 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3138 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3139 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3140 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3141 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3144 let Predicates = [UseSSE2] in {
3145 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3146 (MOVNTDQmr addr:$dst, VR128:$src)>;
3147 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3148 (MOVNTDQmr addr:$dst, VR128:$src)>;
3149 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3150 (MOVNTDQmr addr:$dst, VR128:$src)>;
3153 } // AddedComplexity
3155 //===----------------------------------------------------------------------===//
3156 // SSE 1 & 2 - Prefetch and memory fence
3157 //===----------------------------------------------------------------------===//
3159 // Prefetch intrinsic.
// prefetcht0/t1/t2/nta share opcode 0x18 and differ only in the ModRM reg
// field (MRM1m/2m/3m/0m). The third pattern operand is the locality level
// (3 = closest, 0 = non-temporal); the fourth marks a data (not code) fetch.
3160 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3161 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3162 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3163 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3164 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3165 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3166 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3167 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3168 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3171 // FIXME: How should flush instruction be modeled?
// Cache-line flush for the address operand.
3172 let SchedRW = [WriteLoad] in {
3174 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3175 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3176 PS, Requires<[HasSSE2]>;
3179 let SchedRW = [WriteNop] in {
3180 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3181 // was introduced with SSE2, it's backward compatible.
3182 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3183 "pause", [(int_x86_sse2_pause)]>, OBXS;
3186 let SchedRW = [WriteFence] in {
3187 // Load, store, and memory fence
3188 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3189 // to include any 64-bit target.
// All three fences share opcode 0x0F 0xAE and are distinguished by the
// fixed ModRM byte (MRM_F8/E8/F0).
3190 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3191 PS, Requires<[HasSSE1]>;
3192 def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3193 PS, Requires<[HasSSE2]>;
3194 def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3195 PS, Requires<[HasMFence]>;
// Lower the target-independent X86MFence node to the mfence instruction.
3198 def : Pat<(X86MFence), (MFENCE)>;
3200 //===----------------------------------------------------------------------===//
3201 // SSE 1 & 2 - Load/Store XCSR register
3202 //===----------------------------------------------------------------------===//
// ldmxcsr/stmxcsr read/write the MXCSR control/status register. hasSideEffects
// is set because MXCSR affects (and reflects) global FP state.
3204 let mayLoad=1, hasSideEffects=1 in
3205 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3206 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3207 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3208 let mayStore=1, hasSideEffects=1 in
3209 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3210 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3211 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
// Legacy (non-VEX) encodings.
3213 let mayLoad=1, hasSideEffects=1 in
3214 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3215 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3216 TB, Sched<[WriteLDMXCSR]>;
3217 let mayStore=1, hasSideEffects=1 in
3218 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3219 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3220 TB, Sched<[WriteSTMXCSR]>;
3222 //===---------------------------------------------------------------------===//
3223 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3224 //===---------------------------------------------------------------------===//
3226 let ExeDomain = SSEPackedInt in { // SSE integer instructions
// Register-to-register moves (0x6F). No patterns: plain moves are handled
// by the register allocator; these exist for encoding/assembly.
3228 let hasSideEffects = 0 in {
3229 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3230 "movdqa\t{$src, $dst|$dst, $src}", []>,
3231 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3232 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3233 "movdqu\t{$src, $dst|$dst, $src}", []>,
3234 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3235 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3236 "movdqa\t{$src, $dst|$dst, $src}", []>,
3237 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3238 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3239 "movdqu\t{$src, $dst|$dst, $src}", []>,
3240 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
// _REV forms use the store opcode (0x7F) with register operands; they are
// disassembly-only aliases of the 0x6F forms (FoldGenData links them).
3244 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3245 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3246 "movdqa\t{$src, $dst|$dst, $src}", []>,
3247 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3248 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3249 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3250 "movdqa\t{$src, $dst|$dst, $src}", []>,
3251 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3252 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3253 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3254 "movdqu\t{$src, $dst|$dst, $src}", []>,
3255 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3256 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3257 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3258 "movdqu\t{$src, $dst|$dst, $src}", []>,
3259 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3260 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
// VEX-encoded loads; rematerializable and foldable as loads.
3263 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3264 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3265 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3266 "movdqa\t{$src, $dst|$dst, $src}",
3267 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3268 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3269 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3270 "movdqa\t{$src, $dst|$dst, $src}", []>,
3271 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3272 VEX, VEX_L, VEX_WIG;
3273 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3274 "vmovdqu\t{$src, $dst|$dst, $src}",
3275 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3276 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3278 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3279 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3280 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3281 XS, VEX, VEX_L, VEX_WIG;
// VEX-encoded stores (0x7F).
3284 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3285 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3286 (ins i128mem:$dst, VR128:$src),
3287 "movdqa\t{$src, $dst|$dst, $src}",
3288 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3289 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3290 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3291 (ins i256mem:$dst, VR256:$src),
3292 "movdqa\t{$src, $dst|$dst, $src}", []>,
3293 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3294 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3295 "vmovdqu\t{$src, $dst|$dst, $src}",
3296 [(store (v2i64 VR128:$src), addr:$dst)]>,
3297 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3298 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3299 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3300 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
// Legacy SSE2 (non-VEX) register moves.
3303 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3304 let hasSideEffects = 0 in {
3305 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3306 "movdqa\t{$src, $dst|$dst, $src}", []>;
3308 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3309 "movdqu\t{$src, $dst|$dst, $src}", []>,
3310 XS, Requires<[UseSSE2]>;
3314 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3315 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3316 "movdqa\t{$src, $dst|$dst, $src}", []>,
3317 FoldGenData<"MOVDQArr">;
3319 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3320 "movdqu\t{$src, $dst|$dst, $src}", []>,
3321 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
// Legacy loads/stores; selection patterns are intentionally commented out
// (selection is handled elsewhere), the forms exist for folding/encoding.
3325 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3326 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3327 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3328 "movdqa\t{$src, $dst|$dst, $src}",
3329 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3330 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3331 "movdqu\t{$src, $dst|$dst, $src}",
3332 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3333 XS, Requires<[UseSSE2]>;
3336 let mayStore = 1, hasSideEffects = 0,
3337 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3338 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3339 "movdqa\t{$src, $dst|$dst, $src}",
3340 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3341 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3342 "movdqu\t{$src, $dst|$dst, $src}",
3343 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3344 XS, Requires<[UseSSE2]>;
3347 } // ExeDomain = SSEPackedInt
3349 // Reversed version with ".s" suffix for GAS compatibility.
// The ".s" mnemonic forces the MRMDestReg (_REV) encoding; the trailing 0
// keeps these aliases out of the disassembler's preferred output.
3350 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3351 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3352 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3353 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3354 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3355 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3356 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3357 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3359 // Reversed version with ".s" suffix for GAS compatibility.
3360 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3361 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3362 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3363 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
// Route the other 128-bit integer element types through the v2i64-typed
// VMOVDQA/VMOVDQU instructions (aligned -> A forms, unaligned -> U forms).
3365 let Predicates = [HasAVX, NoVLX] in {
3366 // Additional patterns for other integer sizes.
3367 def : Pat<(alignedloadv4i32 addr:$src),
3368 (VMOVDQArm addr:$src)>;
3369 def : Pat<(alignedloadv8i16 addr:$src),
3370 (VMOVDQArm addr:$src)>;
3371 def : Pat<(alignedloadv16i8 addr:$src),
3372 (VMOVDQArm addr:$src)>;
3373 def : Pat<(loadv4i32 addr:$src),
3374 (VMOVDQUrm addr:$src)>;
3375 def : Pat<(loadv8i16 addr:$src),
3376 (VMOVDQUrm addr:$src)>;
3377 def : Pat<(loadv16i8 addr:$src),
3378 (VMOVDQUrm addr:$src)>;
3380 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3381 (VMOVDQAmr addr:$dst, VR128:$src)>;
3382 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3383 (VMOVDQAmr addr:$dst, VR128:$src)>;
3384 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3385 (VMOVDQAmr addr:$dst, VR128:$src)>;
3386 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3387 (VMOVDQUmr addr:$dst, VR128:$src)>;
3388 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3389 (VMOVDQUmr addr:$dst, VR128:$src)>;
3390 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3391 (VMOVDQUmr addr:$dst, VR128:$src)>;
3394 //===---------------------------------------------------------------------===//
3395 // SSE2 - Packed Integer Arithmetic Instructions
3396 //===---------------------------------------------------------------------===//
3398 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3400 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
// Emits reg-reg (rr, commutable) and reg-mem (rm) forms. The !if on Is2Addr
// (line elided in this excerpt) picks two- vs three-operand asm syntax.
3401 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3402 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3403 PatFrag memop_frag, X86MemOperand x86memop,
3404 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3405 let isCommutable = 1 in
3406 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3407 (ins RC:$src1, RC:$src2),
3409 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3410 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3411 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3413 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3414 (ins RC:$src1, x86memop:$src2),
3416 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3417 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3418 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3419 (memop_frag addr:$src2))))]>,
3420 Sched<[sched.Folded, sched.ReadAfterFold]>;
3422 } // ExeDomain = SSEPackedInt
// Packed integer arithmetic. Each defm expands to SSE + AVX (+AVX2 YMM)
// variants via PDI_binop_all; the bit after the sched class marks
// commutativity, and the final predicate gates against EVEX (VLX/BWI) forms.
3424 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3425 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3426 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3427 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3428 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3429 SchedWriteVecALU, 1, NoVLX>;
3430 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3431 SchedWriteVecALU, 1, NoVLX>;
3432 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3433 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3434 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3435 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3436 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3437 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3438 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3439 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3440 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3441 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3442 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3443 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3444 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3445 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3446 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3447 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3448 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3449 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3450 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3451 SchedWriteVecALU, 0, NoVLX>;
3452 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3453 SchedWriteVecALU, 0, NoVLX>;
3454 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3455 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3456 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3457 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3458 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3459 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3460 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3461 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3462 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3463 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3464 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3465 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3466 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3467 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3468 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3469 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3470 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3471 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3472 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3473 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3474 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3475 SchedWriteVecIMul, 1, NoVLX>;
// pmaddwd and psadbw use PDI_binop_rm2 because their source and destination
// element types differ (e.g. v8i16 inputs -> v4i32 result for pmaddwd).
3477 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3478 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3479 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3482 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3483 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3484 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3485 0>, VEX_4V, VEX_L, VEX_WIG;
3486 let Constraints = "$src1 = $dst" in
3487 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3488 memop, i128mem, SchedWriteVecIMul.XMM>;
// Sum of absolute differences: byte inputs, i64-lane results.
3490 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3491 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3492 load, i128mem, SchedWritePSADBW.XMM, 0>,
3494 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3495 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3496 load, i256mem, SchedWritePSADBW.YMM, 0>,
3497 VEX_4V, VEX_L, VEX_WIG;
3498 let Constraints = "$src1 = $dst" in
3499 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3500 memop, i128mem, SchedWritePSADBW.XMM>;
3502 //===---------------------------------------------------------------------===//
3503 // SSE2 - Packed Integer Logical Instructions
3504 //===---------------------------------------------------------------------===//
// PDI_binop_rmi - Shift-style op with three encodings: by-register (rr),
// by-memory count (rm), and by-immediate (ri, via OpNode2/opc2). The shift
// count operand is always a full 128-bit register regardless of RC.
3506 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3507 string OpcodeStr, SDNode OpNode,
3508 SDNode OpNode2, RegisterClass RC,
3509 X86FoldableSchedWrite sched,
3510 X86FoldableSchedWrite schedImm,
3511 ValueType DstVT, ValueType SrcVT,
3512 PatFrag ld_frag, bit Is2Addr = 1> {
3513 // src2 is always 128-bit
3514 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3515 (ins RC:$src1, VR128:$src2),
3517 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3518 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3519 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3521 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3522 (ins RC:$src1, i128mem:$src2),
3524 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3525 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3526 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3527 (SrcVT (ld_frag addr:$src2)))))]>,
3528 Sched<[sched.Folded, sched.ReadAfterFold]>;
3529 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3530 (ins RC:$src1, u8imm:$src2),
3532 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3533 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3534 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
// PDI_binop_rmi_all - Wrap PDI_binop_rmi to emit the VEX XMM form (HasAVX),
// the VEX YMM form (HasAVX2), and the legacy two-address SSE form.
3538 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3539 string OpcodeStr, SDNode OpNode,
3540 SDNode OpNode2, ValueType DstVT128,
3541 ValueType DstVT256, ValueType SrcVT,
3542 X86SchedWriteWidths sched,
3543 X86SchedWriteWidths schedImm, Predicate prd> {
3544 let Predicates = [HasAVX, prd] in
3545 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3546 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3547 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3548 let Predicates = [HasAVX2, prd] in
3549 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3550 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3551 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3553 let Constraints = "$src1 = $dst" in
3554 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3555 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
// PDI_binop_ri - Immediate-only op (no reg/mem count form); used below for
// the whole-register byte shifts pslldq/psrldq.
3559 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3560 SDNode OpNode, RegisterClass RC, ValueType VT,
3561 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3562 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3564 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3565 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3566 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
// PDI_binop_ri_all - SSE + AVX/AVX2 expansion of PDI_binop_ri over
// v16i8/v32i8, mirroring PDI_binop_rmi_all.
3570 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3571 SDNode OpNode, X86SchedWriteWidths sched> {
3572 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3573 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3574 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3575 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3576 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3577 VR256, v32i8, sched.YMM, 0>,
3578 VEX_4V, VEX_L, VEX_WIG;
3579 let Constraints = "$src1 = $dst" in
3580 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
// Packed shifts: logical left (psll*), logical right (psrl*), arithmetic
// right (psra* — note: no 64-bit arithmetic shift exists in SSE2), plus the
// byte-granular whole-register shifts pslldq/psrldq (immediate-only).
3584 let ExeDomain = SSEPackedInt in {
3585 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3586 v8i16, v16i16, v8i16, SchedWriteVecShift,
3587 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3588 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3589 v4i32, v8i32, v4i32, SchedWriteVecShift,
3590 SchedWriteVecShiftImm, NoVLX>;
3591 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3592 v2i64, v4i64, v2i64, SchedWriteVecShift,
3593 SchedWriteVecShiftImm, NoVLX>;
3595 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3596 v8i16, v16i16, v8i16, SchedWriteVecShift,
3597 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3598 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3599 v4i32, v8i32, v4i32, SchedWriteVecShift,
3600 SchedWriteVecShiftImm, NoVLX>;
3601 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3602 v2i64, v4i64, v2i64, SchedWriteVecShift,
3603 SchedWriteVecShiftImm, NoVLX>;
3605 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3606 v8i16, v16i16, v8i16, SchedWriteVecShift,
3607 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3608 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3609 v4i32, v8i32, v4i32, SchedWriteVecShift,
3610 SchedWriteVecShiftImm, NoVLX>;
3612 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3614 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3616 } // ExeDomain = SSEPackedInt
3618 //===---------------------------------------------------------------------===//
3619 // SSE2 - Packed Integer Comparison Instructions
3620 //===---------------------------------------------------------------------===//
// Equality compares are commutable (bit = 1); signed greater-than is not
// (bit = 0). TruePredicate: no extra feature gating beyond PDI_binop_all's.
3622 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3623 SchedWriteVecALU, 1, TruePredicate>;
3624 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3625 SchedWriteVecALU, 1, TruePredicate>;
3626 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3627 SchedWriteVecALU, 1, TruePredicate>;
3628 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3629 SchedWriteVecALU, 0, TruePredicate>;
3630 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3631 SchedWriteVecALU, 0, TruePredicate>;
3632 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3633 SchedWriteVecALU, 0, TruePredicate>;
3635 //===---------------------------------------------------------------------===//
3636 // SSE2 - Packed Integer Shuffle Instructions
3637 //===---------------------------------------------------------------------===//
3639 let ExeDomain = SSEPackedInt in {
// sse2_pshuffle - pshufd/pshufhw/pshuflw-style immediate shuffles (opcode
// 0x70). Emits VEX XMM (HasAVX), VEX YMM (HasAVX2), and legacy SSE2 forms,
// each with register (ri) and memory (mi) variants.
3640 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3641 SDNode OpNode, X86SchedWriteWidths sched,
3643 let Predicates = [HasAVX, prd] in {
3644 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3645 (ins VR128:$src1, u8imm:$src2),
3646 !strconcat("v", OpcodeStr,
3647 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3649 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3650 VEX, Sched<[sched.XMM]>, VEX_WIG;
3651 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3652 (ins i128mem:$src1, u8imm:$src2),
3653 !strconcat("v", OpcodeStr,
3654 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3656 (vt128 (OpNode (load addr:$src1),
3657 (i8 timm:$src2))))]>, VEX,
3658 Sched<[sched.XMM.Folded]>, VEX_WIG;
3661 let Predicates = [HasAVX2, prd] in {
3662 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3663 (ins VR256:$src1, u8imm:$src2),
3664 !strconcat("v", OpcodeStr,
3665 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3667 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3668 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3669 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3670 (ins i256mem:$src1, u8imm:$src2),
3671 !strconcat("v", OpcodeStr,
3672 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3674 (vt256 (OpNode (load addr:$src1),
3675 (i8 timm:$src2))))]>, VEX, VEX_L,
3676 Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 encoding; memory form uses memop (alignment-checked load).
3679 let Predicates = [UseSSE2] in {
3680 def ri : Ii8<0x70, MRMSrcReg,
3681 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3682 !strconcat(OpcodeStr,
3683 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3685 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3687 def mi : Ii8<0x70, MRMSrcMem,
3688 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3689 !strconcat(OpcodeStr,
3690 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3692 (vt128 (OpNode (memop addr:$src1),
3693 (i8 timm:$src2))))]>,
3694 Sched<[sched.XMM.Folded]>;
3697 } // ExeDomain = SSEPackedInt
// The three shuffles share opcode 0x70 and are distinguished by the
// mandatory prefix: PD = pshufd, XS = pshufhw, XD = pshuflw.
3699 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3700 SchedWriteShuffle, NoVLX>, PD;
3701 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3702 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3703 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3704 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3706 //===---------------------------------------------------------------------===//
3707 // Packed Integer Pack Instructions (SSE & AVX)
3708 //===---------------------------------------------------------------------===//
3710 let ExeDomain = SSEPackedInt in {
// sse2_pack - Saturating narrowing (ArgVT lanes -> OutVT lanes) with rr and
// rm forms; asm string chosen by Is2Addr (the !if line is elided here).
3711 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3712 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3713 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3714 PatFrag ld_frag, bit Is2Addr = 1> {
3715 def rr : PDI<opc, MRMSrcReg,
3716 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3718 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3719 !strconcat(OpcodeStr,
3720 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3722 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3724 def rm : PDI<opc, MRMSrcMem,
3725 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3727 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3728 !strconcat(OpcodeStr,
3729 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3731 (OutVT (OpNode (ArgVT RC:$src1),
3732 (ld_frag addr:$src2))))]>,
3733 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sse4_pack - Same shape but SS48I base class (0x66 0x0F 0x38 encoding
// space) for the SSE4.1 packusdw.
3736 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3737 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3738 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3739 PatFrag ld_frag, bit Is2Addr = 1> {
3740 def rr : SS48I<opc, MRMSrcReg,
3741 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3743 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3744 !strconcat(OpcodeStr,
3745 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3747 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3749 def rm : SS48I<opc, MRMSrcMem,
3750 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3752 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3753 !strconcat(OpcodeStr,
3754 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3756 (OutVT (OpNode (ArgVT RC:$src1),
3757 (ld_frag addr:$src2))))]>,
3758 Sched<[sched.Folded, sched.ReadAfterFold]>;
3761 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3762 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3763 i128mem, SchedWriteShuffle.XMM, load, 0>,
3765 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3766 i128mem, SchedWriteShuffle.XMM, load, 0>,
3769 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3770 i128mem, SchedWriteShuffle.XMM, load, 0>,
3772 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3773 i128mem, SchedWriteShuffle.XMM, load, 0>,
3777 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3778 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3779 i256mem, SchedWriteShuffle.YMM, load, 0>,
3780 VEX_4V, VEX_L, VEX_WIG;
3781 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3782 i256mem, SchedWriteShuffle.YMM, load, 0>,
3783 VEX_4V, VEX_L, VEX_WIG;
3785 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3786 i256mem, SchedWriteShuffle.YMM, load, 0>,
3787 VEX_4V, VEX_L, VEX_WIG;
3788 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3789 i256mem, SchedWriteShuffle.YMM, load, 0>,
3793 let Constraints = "$src1 = $dst" in {
3794 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3795 i128mem, SchedWriteShuffle.XMM, memop>;
3796 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3797 i128mem, SchedWriteShuffle.XMM, memop>;
3799 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3800 i128mem, SchedWriteShuffle.XMM, memop>;
3802 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3803 i128mem, SchedWriteShuffle.XMM, memop>;
3805 } // ExeDomain = SSEPackedInt
3807 //===---------------------------------------------------------------------===//
3808 // SSE2 - Packed Integer Unpack Instructions
3809 //===---------------------------------------------------------------------===//
3811 let ExeDomain = SSEPackedInt in {
// sse2_unpack: rr and rm forms of the PUNPCK{L,H}{BW,WD,DQ,QDQ} family.
// NOTE(review): this is a line-numbered dump with some interior lines
// (asm-string !if selectors, Sched<> of the rr form, closing braces)
// dropped by the extractor; consult the original file before editing.
3812 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3813                        SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3814                        X86FoldableSchedWrite sched, PatFrag ld_frag,
3816   def rr : PDI<opc, MRMSrcReg,
3817       (outs RC:$dst), (ins RC:$src1, RC:$src2),
3819           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3820           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3821       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3823   def rm : PDI<opc, MRMSrcMem,
3824       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3826           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3827           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3828       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3829       Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX 128-bit byte/word unpacks: gated on NoVLX_Or_NoBWI (byte/word ops
// need AVX512BW for the EVEX form to win).
3832 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3833   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3834                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3836   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3837                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3839   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3840                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3842   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3843                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
// AVX 128-bit dword/qword unpacks: only need NoVLX (no BW dependency).
3847 let Predicates = [HasAVX, NoVLX] in {
3848   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3849                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3851   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3852                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3854   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3855                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
3857   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3858                                  i128mem, SchedWriteShuffle.XMM, load, 0>,
// AVX2 256-bit byte/word unpacks.
3862 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3863   defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3864                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3865                                   VEX_4V, VEX_L, VEX_WIG;
3866   defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3867                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3868                                   VEX_4V, VEX_L, VEX_WIG;
3869   defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3870                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3871                                   VEX_4V, VEX_L, VEX_WIG;
3872   defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3873                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3874                                   VEX_4V, VEX_L, VEX_WIG;
// AVX2 256-bit dword/qword unpacks.
3877 let Predicates = [HasAVX2, NoVLX] in {
3878   defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3879                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3880                                   VEX_4V, VEX_L, VEX_WIG;
3881   defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3882                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3883                                   VEX_4V, VEX_L, VEX_WIG;
3884   defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3885                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3886                                   VEX_4V, VEX_L, VEX_WIG;
3887   defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3888                                   i256mem, SchedWriteShuffle.YMM, load, 0>,
3889                                   VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE forms: two-address, aligned `memop` load fragment.
3892 let Constraints = "$src1 = $dst" in {
3893   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3894                                 i128mem, SchedWriteShuffle.XMM, memop>;
3895   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3896                                 i128mem, SchedWriteShuffle.XMM, memop>;
3897   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3898                                 i128mem, SchedWriteShuffle.XMM, memop>;
3899   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3900                                 i128mem, SchedWriteShuffle.XMM, memop>;
3902   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3903                                 i128mem, SchedWriteShuffle.XMM, memop>;
3904   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3905                                 i128mem, SchedWriteShuffle.XMM, memop>;
3906   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3907                                 i128mem, SchedWriteShuffle.XMM, memop>;
3908   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3909                                 i128mem, SchedWriteShuffle.XMM, memop>;
3911 } // ExeDomain = SSEPackedInt
3913 //===---------------------------------------------------------------------===//
3914 // SSE2 - Packed Integer Extract and Insert
3915 //===---------------------------------------------------------------------===//
3917 let ExeDomain = SSEPackedInt in {
// sse2_pinsrw: PINSRW/VPINSRW — insert a 16-bit value from a GPR (rr) or
// from memory (rm, extloadi16) into an XMM lane selected by $src3.
// NOTE(review): line-numbered dump; some interior lines of the asm
// selector and the closing braces are missing from this extraction.
3918 multiclass sse2_pinsrw<bit Is2Addr = 1> {
3919   def rr : Ii8<0xC4, MRMSrcReg,
3920        (outs VR128:$dst), (ins VR128:$src1,
3921         GR32orGR64:$src2, u8imm:$src3),
3923            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3924            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3926          (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
3927        Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3928   def rm : Ii8<0xC4, MRMSrcMem,
3929                       (outs VR128:$dst), (ins VR128:$src1,
3930                        i16mem:$src2, u8imm:$src3),
3932            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3933            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3935                    (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3937        Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// PEXTRW/VPEXTRW register forms: extract a 16-bit lane into a GPR.
// The VEX form is gated on NoBWI so the EVEX variant wins under AVX512BW.
3941 let Predicates = [HasAVX, NoBWI] in
3942 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3943                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3944                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3945                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3947                     PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
3948 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3949                      (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3950                      "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3951                      [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3953                      Sched<[WriteVecExtract]>;
// Instantiate PINSRW: VEX three-operand form vs. legacy tied-operand form.
3956 let Predicates = [HasAVX, NoBWI] in
3957 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
3959 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3960 defm PINSRW : sse2_pinsrw, PD;
3962 } // ExeDomain = SSEPackedInt
3964 //===---------------------------------------------------------------------===//
3965 // SSE2 - Packed Mask Creation
3966 //===---------------------------------------------------------------------===//
3968 let ExeDomain = SSEPackedInt in {
// PMOVMSKB: gather the sign bits of each byte of an XMM/YMM register
// into a GPR (X86movmsk node). VEX 128-bit, VEX.L 256-bit (AVX2), and
// legacy SSE2 forms.
3970 def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3972            "pmovmskb\t{$src, $dst|$dst, $src}",
3973            [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3974            Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
3976 let Predicates = [HasAVX2] in {
3977 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3979            "pmovmskb\t{$src, $dst|$dst, $src}",
3980            [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3981            Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
3984 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3985            "pmovmskb\t{$src, $dst|$dst, $src}",
3986            [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3987            Sched<[WriteVecMOVMSK]>;
3989 } // ExeDomain = SSEPackedInt
3991 //===---------------------------------------------------------------------===//
3992 // SSE2 - Conditional Store
3993 //===---------------------------------------------------------------------===//
// MASKMOVDQU: byte-masked store to [EDI]/[RDI]. The destination pointer
// is an implicit register use (Uses = [EDI]/[RDI]), so separate defs are
// needed for 32-bit vs. 64-bit mode, each for the VEX and legacy forms.
3995 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3996 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
3997 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
3998            (ins VR128:$src, VR128:$mask),
3999            "maskmovdqu\t{$mask, $src|$src, $mask}",
4000            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4002 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4003 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4004            (ins VR128:$src, VR128:$mask),
4005            "maskmovdqu\t{$mask, $src|$src, $mask}",
4006            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4009 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4010 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4011            "maskmovdqu\t{$mask, $src|$src, $mask}",
4012            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4013 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4014 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4015            "maskmovdqu\t{$mask, $src|$src, $mask}",
4016            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4018 } // ExeDomain = SSEPackedInt
4020 //===---------------------------------------------------------------------===//
4021 // SSE2 - Move Doubleword/Quadword
4022 //===---------------------------------------------------------------------===//
4024 //===---------------------------------------------------------------------===//
4025 // Move Int Doubleword to Packed Double Int
// MOVD/MOVQ GPR/memory -> XMM (opcode 0x6E), VEX and legacy variants.
// The 64-bit forms use the REX.W classes (VRS2I/RS2I). The *toSDrr forms
// are isCodeGenOnly bitconvert helpers (GR64 -> FR64).
// NOTE(review): line-numbered dump; the `[(set VR128:$dst,` lines of
// several patterns are missing from this extraction.
4027 let ExeDomain = SSEPackedInt in {
4028 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4029                         "movd\t{$src, $dst|$dst, $src}",
4031                           (v4i32 (scalar_to_vector GR32:$src)))]>,
4032                         VEX, Sched<[WriteVecMoveFromGpr]>;
4033 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4034                         "movd\t{$src, $dst|$dst, $src}",
4036                           (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4037                         VEX, Sched<[WriteVecLoad]>;
4038 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4039                         "movq\t{$src, $dst|$dst, $src}",
4041                           (v2i64 (scalar_to_vector GR64:$src)))]>,
4042                         VEX, Sched<[WriteVecMoveFromGpr]>;
// Load form kept pattern-less for the disassembler only.
4043 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4044 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4045                         "movq\t{$src, $dst|$dst, $src}", []>,
4046                         VEX, Sched<[WriteVecLoad]>;
4047 let isCodeGenOnly = 1 in
4048 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4049                        "movq\t{$src, $dst|$dst, $src}",
4050                        [(set FR64:$dst, (bitconvert GR64:$src))]>,
4051                        VEX, Sched<[WriteVecMoveFromGpr]>;
// Legacy (non-VEX) counterparts of the defs above.
4053 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4054                       "movd\t{$src, $dst|$dst, $src}",
4056                         (v4i32 (scalar_to_vector GR32:$src)))]>,
4057                       Sched<[WriteVecMoveFromGpr]>;
4058 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4059                       "movd\t{$src, $dst|$dst, $src}",
4061                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4062                       Sched<[WriteVecLoad]>;
4063 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4064                         "movq\t{$src, $dst|$dst, $src}",
4066                           (v2i64 (scalar_to_vector GR64:$src)))]>,
4067                         Sched<[WriteVecMoveFromGpr]>;
4068 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4069 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4070                         "movq\t{$src, $dst|$dst, $src}", []>,
4071                         Sched<[WriteVecLoad]>;
4072 let isCodeGenOnly = 1 in
4073 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4074                        "movq\t{$src, $dst|$dst, $src}",
4075                        [(set FR64:$dst, (bitconvert GR64:$src))]>,
4076                        Sched<[WriteVecMoveFromGpr]>;
4077 } // ExeDomain = SSEPackedInt
4079 //===---------------------------------------------------------------------===//
4080 // Move Int Doubleword to Single Scalar
// isCodeGenOnly bitconvert helpers: reinterpret a GR32 as an FR32 via
// MOVD (VEX and legacy encodings).
4082 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4083   def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4084                         "movd\t{$src, $dst|$dst, $src}",
4085                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
4086                         VEX, Sched<[WriteVecMoveFromGpr]>;
4088   def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4089                         "movd\t{$src, $dst|$dst, $src}",
4090                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
4091                         Sched<[WriteVecMoveFromGpr]>;
4093 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4095 //===---------------------------------------------------------------------===//
4096 // Move Packed Doubleword Int to Packed Double Int
// MOVD store direction (opcode 0x7E, MRMDest*): extract element 0 of a
// v4i32 into a GR32 (rr) or store it to memory (mr). VEX and legacy.
4098 let ExeDomain = SSEPackedInt in {
4099 def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4100                          "movd\t{$src, $dst|$dst, $src}",
4101                          [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4103                          Sched<[WriteVecMoveToGpr]>;
4104 def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4105                          (ins i32mem:$dst, VR128:$src),
4106                          "movd\t{$src, $dst|$dst, $src}",
4107                          [(store (i32 (extractelt (v4i32 VR128:$src),
4108                                        (iPTR 0))), addr:$dst)]>,
4109                          VEX, Sched<[WriteVecStore]>;
4110 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4111                       "movd\t{$src, $dst|$dst, $src}",
4112                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4114                       Sched<[WriteVecMoveToGpr]>;
4115 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4116                       "movd\t{$src, $dst|$dst, $src}",
4117                       [(store (i32 (extractelt (v4i32 VR128:$src),
4118                                     (iPTR 0))), addr:$dst)]>,
4119                       Sched<[WriteVecStore]>;
4120 } // ExeDomain = SSEPackedInt
4122 //===---------------------------------------------------------------------===//
4123 // Move Packed Doubleword Int first element to Doubleword Int
// MOVQ store direction: extract element 0 of a v2i64 into a GR64.
// The mr (memory) forms are disassembler-only (pattern-less).
4125 let ExeDomain = SSEPackedInt in {
4126 let SchedRW = [WriteVecMoveToGpr] in {
4127 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4128                           "movq\t{$src, $dst|$dst, $src}",
4129                           [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4133 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4134                         "movq\t{$src, $dst|$dst, $src}",
4135                         [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4139 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4140 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4141                           (ins i64mem:$dst, VR128:$src),
4142                           "movq\t{$src, $dst|$dst, $src}", []>,
4143                           VEX, Sched<[WriteVecStore]>;
4144 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4145 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4146                         "movq\t{$src, $dst|$dst, $src}", []>,
4147                         Sched<[WriteVecStore]>;
4148 } // ExeDomain = SSEPackedInt
4150 //===---------------------------------------------------------------------===//
4151 // Bitcast FR64 <-> GR64
// isCodeGenOnly bitconvert helpers: reinterpret an FR64 as a GR64 via
// MOVQ (VEX and legacy encodings).
4153 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4154 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4155                          "movq\t{$src, $dst|$dst, $src}",
4156                          [(set GR64:$dst, (bitconvert FR64:$src))]>,
4157                          VEX, Sched<[WriteVecMoveToGpr]>;
4159 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4160                        "movq\t{$src, $dst|$dst, $src}",
4161                        [(set GR64:$dst, (bitconvert FR64:$src))]>,
4162                        Sched<[WriteVecMoveToGpr]>;
4163 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4165 //===---------------------------------------------------------------------===//
4166 // Move Scalar Single to Double Int
// isCodeGenOnly bitconvert helpers: reinterpret an FR32 as a GR32 via
// MOVD (VEX and legacy encodings).
4168 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4169 def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4170                         "movd\t{$src, $dst|$dst, $src}",
4171                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
4172                         VEX, Sched<[WriteVecMoveToGpr]>;
4173 def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4174                       "movd\t{$src, $dst|$dst, $src}",
4175                       [(set GR32:$dst, (bitconvert FR32:$src))]>,
4176                       Sched<[WriteVecMoveToGpr]>;
4177 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Selection patterns mapping zero-extending scalar-to-vector moves
// (X86vzmovl / X86vzload32) onto the MOVD/MOVQ defs above, for AVX and
// then SSE2. The v8i32 pattern relies on VEX moves implicitly zeroing
// the upper YMM half (see the comment at original lines 4186-4187).
4179 let Predicates = [UseAVX] in {
4180   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4181             (VMOVDI2PDIrr GR32:$src)>;
4183   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4184             (VMOV64toPQIrr GR64:$src)>;
4186   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4187   // These instructions also write zeros in the high part of a 256-bit register.
4188   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4189             (VMOVDI2PDIrm addr:$src)>;
4190   def : Pat<(v4i32 (X86vzload32 addr:$src)),
4191             (VMOVDI2PDIrm addr:$src)>;
4192   def : Pat<(v8i32 (X86vzload32 addr:$src)),
4193             (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4196 let Predicates = [UseSSE2] in {
4197   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4198             (MOVDI2PDIrr GR32:$src)>;
4200   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4201             (MOV64toPQIrr GR64:$src)>;
4202   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4203             (MOVDI2PDIrm addr:$src)>;
4204   def : Pat<(v4i32 (X86vzload32 addr:$src)),
4205             (MOVDI2PDIrm addr:$src)>;
// Assembler aliases: accept "movd" with 64-bit operands for legacy
// assembly; emit-priority 0 so the canonical "movq" spelling is printed.
4208 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4209 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
4211 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4212                 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4213 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4214                 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4215 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4216 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4217                 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4218 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4219                 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4221 //===---------------------------------------------------------------------===//
4222 // SSE2 - Move Quadword
4223 //===---------------------------------------------------------------------===//
4225 //===---------------------------------------------------------------------===//
4226 // Move Quadword Int to Packed Quadword Int
// MOVQ load forms (F3-prefixed 0x7E): load 64 bits from memory into the
// low qword of an XMM register via scalar_to_vector.
4229 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4230 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4231                    "vmovq\t{$src, $dst|$dst, $src}",
4233                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4234                    VEX, Requires<[UseAVX]>, VEX_WIG;
4235 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4236                    "movq\t{$src, $dst|$dst, $src}",
4238                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4239                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4240 } // ExeDomain, SchedRW
4242 //===---------------------------------------------------------------------===//
4243 // Move Packed Quadword Int to Quadword Int
// MOVQ store forms (0xD6): store the low qword of an XMM register
// (extractelt index 0) to memory. VEX and legacy variants.
4245 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4246 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4247                         "movq\t{$src, $dst|$dst, $src}",
4248                         [(store (i64 (extractelt (v2i64 VR128:$src),
4249                                       (iPTR 0))), addr:$dst)]>,
4251 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4252                       "movq\t{$src, $dst|$dst, $src}",
4253                       [(store (i64 (extractelt (v2i64 VR128:$src),
4254                                     (iPTR 0))), addr:$dst)]>;
4255 } // ExeDomain, SchedRW
4257 // For disassembler only
// Register-register store-form MOVQ (0xD6 /r with a register modrm):
// pattern-less, exposed so the disassembler and the ".s" asm aliases can
// name the alternate encoding.
4258 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4259     SchedRW = [SchedWriteVecLogic.XMM] in {
4260 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4261                         "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4262 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4263                       "movq\t{$src, $dst|$dst, $src}", []>;
4266 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4267                 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4268 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4269                 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
// Select zero-extending 64-bit loads and low-qword extract-stores onto
// the MOVQ load/store defs above.
4271 let Predicates = [UseAVX] in {
4272   def : Pat<(v2i64 (X86vzload64 addr:$src)),
4273             (VMOVQI2PQIrm addr:$src)>;
4274   def : Pat<(v4i64 (X86vzload64 addr:$src)),
4275             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4277   def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4278             (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4281 let Predicates = [UseSSE2] in {
4282   def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4284   def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4285             (MOVPQI2QImr addr:$dst, VR128:$src)>;
4288 //===---------------------------------------------------------------------===//
4289 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4290 // IA32 document. movq xmm1, xmm2 does clear the high bits.
// Register-register MOVQ (F3 0x7E) used as a "zero upper qword" move,
// matched from X86vzmovl on v2i64.
4292 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4293 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4294                          "vmovq\t{$src, $dst|$dst, $src}",
4295                          [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4296                          XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4297 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4298                         "movq\t{$src, $dst|$dst, $src}",
4299                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4300                         XS, Requires<[UseSSE2]>;
4301 } // ExeDomain, SchedRW
// Reuse the same instruction for the f64 interpretation of vzmovl, and
// for 256-bit vzmovl via EXTRACT_SUBREG/SUBREG_TO_REG round-trips.
4303 let Predicates = [UseAVX] in {
4304   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4305             (VMOVZPQILo2PQIrr VR128:$src)>;
4307 let Predicates = [UseSSE2] in {
4308   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4309             (MOVZPQILo2PQIrr VR128:$src)>;
4312 let Predicates = [UseAVX] in {
4313 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4314           (SUBREG_TO_REG (i32 0),
4315            (v2f64 (VMOVZPQILo2PQIrr
4316                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4318 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4319           (SUBREG_TO_REG (i32 0),
4320            (v2i64 (VMOVZPQILo2PQIrr
4321                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4325 //===---------------------------------------------------------------------===//
4326 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4327 //===---------------------------------------------------------------------===//
// sse3_replicate_sfp: unary rr/rm forms for MOVSHDUP/MOVSLDUP
// (duplicate odd/even single-precision lanes). S3SI = SSE3, F3 prefix.
4329 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4330                               ValueType vt, RegisterClass RC, PatFrag mem_frag,
4331                               X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4332 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4333                       !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4334                       [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4336 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4337                       !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4338                       [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4339                       Sched<[sched.Folded]>;
// Instantiate MOVSHDUP/MOVSLDUP: VEX 128/256-bit forms under
// [HasAVX, NoVLX], then legacy SSE3 forms with aligned memopv4f32.
4342 let Predicates = [HasAVX, NoVLX] in {
4343   defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4344                                        v4f32, VR128, loadv4f32, f128mem,
4345                                        SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4346   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4347                                        v4f32, VR128, loadv4f32, f128mem,
4348                                        SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4349   defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4350                                        v8f32, VR256, loadv8f32, f256mem,
4351                                        SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4352   defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4353                                        v8f32, VR256, loadv8f32, f256mem,
4354                                        SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4356 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4357                                    memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4358 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4359                                    memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
// Integer-typed (v4i32/v8i32) uses of the same shuffle nodes map onto
// the FP instructions defined above.
4361 let Predicates = [HasAVX, NoVLX] in {
4362   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4363             (VMOVSHDUPrr VR128:$src)>;
4364   def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4365             (VMOVSHDUPrm addr:$src)>;
4366   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4367             (VMOVSLDUPrr VR128:$src)>;
4368   def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4369             (VMOVSLDUPrm addr:$src)>;
4370   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4371             (VMOVSHDUPYrr VR256:$src)>;
4372   def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4373             (VMOVSHDUPYrm addr:$src)>;
4374   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4375             (VMOVSLDUPYrr VR256:$src)>;
4376   def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4377             (VMOVSLDUPYrm addr:$src)>;
4380 let Predicates = [UseSSE3] in {
4381   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4382             (MOVSHDUPrr VR128:$src)>;
4383   def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4384             (MOVSHDUPrm addr:$src)>;
4385   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4386             (MOVSLDUPrr VR128:$src)>;
4387   def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4388             (MOVSLDUPrm addr:$src)>;
4391 //===---------------------------------------------------------------------===//
4392 // SSE3 - Replicate Double FP - MOVDDUP
4393 //===---------------------------------------------------------------------===//
// sse3_replicate_dfp: 128-bit MOVDDUP — duplicate the low double. The
// rm form loads only 64 bits (f64mem + loadf64/scalar_to_vector).
4395 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4396 def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4397                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4398                [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4400 def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4401                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4404                            (scalar_to_vector (loadf64 addr:$src)))))]>,
4405                Sched<[sched.XMM.Folded]>;
4408 // FIXME: Merge with above classes when there are patterns for the ymm version
// sse3_replicate_dfp_y: 256-bit variant; the rm form loads a full
// 256-bit vector (f256mem/loadv4f64), unlike the 64-bit XMM load above.
4409 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4410 def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4411                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4412                [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4414 def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4415                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4417                 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4418                Sched<[sched.YMM.Folded]>;
// Instantiate MOVDDUP: VEX 128/256-bit forms, then legacy SSE3, plus
// load-folding patterns for plain and zero-extending 64-bit loads.
4421 let Predicates = [HasAVX, NoVLX] in {
4422   defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4424   defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4425                                         VEX, VEX_L, VEX_WIG;
4428 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4431 let Predicates = [HasAVX, NoVLX] in {
4432   def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
4433             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4434   def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4435             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4438 let Predicates = [UseSSE3] in {
4439   // No need for aligned memory as this only loads 64-bits.
4440   def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
4441             (MOVDDUPrm addr:$src)>;
4442   def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4443             (MOVDDUPrm addr:$src)>;
4446 //===---------------------------------------------------------------------===//
4447 // SSE3 - Move Unaligned Integer
4448 //===---------------------------------------------------------------------===//
// LDDQU/VLDDQU: unaligned 128/256-bit integer load, matched only from
// the explicit intrinsics (int_x86_sse3_ldu_dq / int_x86_avx_ldu_dq_256).
4450 let Predicates = [HasAVX] in {
4451   def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4452                       "vlddqu\t{$src, $dst|$dst, $src}",
4453                       [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4454                       Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4455   def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4456                        "vlddqu\t{$src, $dst|$dst, $src}",
4457                        [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4458                        Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4461 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4462                    "lddqu\t{$src, $dst|$dst, $src}",
4463                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4464                    Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4466 //===---------------------------------------------------------------------===//
4467 // SSE3 - Arithmetic
4468 //===---------------------------------------------------------------------===//
4470 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4471 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4472 PatFrag ld_frag, bit Is2Addr = 1> {
4473 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4474 def rr : I<0xD0, MRMSrcReg,
4475 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4477 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4478 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4479 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4481 def rm : I<0xD0, MRMSrcMem,
4482 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4484 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4485 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4486 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4487 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiate ADDSUBPS/ADDSUBPD: VEX three-operand 128/256-bit forms
// (XD prefix for PS, PD prefix for PD), then legacy two-address SSE3
// forms with aligned memop fragments.
4491 let Predicates = [HasAVX] in {
4492   let ExeDomain = SSEPackedSingle in {
4493     defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4494                                  SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4495                                  XD, VEX_4V, VEX_WIG;
4496     defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4497                                   SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4498                                   XD, VEX_4V, VEX_L, VEX_WIG;
4500   let ExeDomain = SSEPackedDouble in {
4501     defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4502                                  SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4503                                  PD, VEX_4V, VEX_WIG;
4504     defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4505                                   SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4506                                   PD, VEX_4V, VEX_L, VEX_WIG;
4509 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4510   let ExeDomain = SSEPackedSingle in
4511     defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4512                                 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4513   let ExeDomain = SSEPackedDouble in
4514     defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4515                                 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4518 //===---------------------------------------------------------------------===//
4519 // SSE3 Instructions
4520 //===---------------------------------------------------------------------===//
// S3D_Int / S3_Int: parallel binary-op multiclasses used for the SSE3
// horizontal add/sub family. S3DI is the F2-prefixed encoding class,
// S3I the 66-prefixed one; both read MXCSR and may raise FP exceptions.
// NOTE(review): line-numbered dump; the !if(Is2Addr, ...) selectors and
// closing braces are missing from this extraction.
4523 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4524                    X86MemOperand x86memop, SDNode OpNode,
4525                    X86FoldableSchedWrite sched, PatFrag ld_frag,
4527 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4528   def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4530         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4531         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4532       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4535   def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4537         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4538         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4539       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4540       Sched<[sched.Folded, sched.ReadAfterFold]>;
4543 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4544                   X86MemOperand x86memop, SDNode OpNode,
4545                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4547 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4548   def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4550         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4551         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4552       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4555   def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4557         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4558         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4559       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4560       Sched<[sched.Folded, sched.ReadAfterFold]>;
4564 let Predicates = [HasAVX] in {
4565 let ExeDomain = SSEPackedSingle in {
4566 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4567 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4568 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4569 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4570 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4571 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4572 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4573 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4575 let ExeDomain = SSEPackedDouble in {
4576 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4577 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4578 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4579 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4580 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4581 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4582 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4583 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4587 let Constraints = "$src1 = $dst" in {
4588 let ExeDomain = SSEPackedSingle in {
4589 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4590 WriteFHAdd, memopv4f32>;
4591 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4592 WriteFHAdd, memopv4f32>;
4594 let ExeDomain = SSEPackedDouble in {
4595 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4596 WriteFHAdd, memopv2f64>;
4597 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4598 WriteFHAdd, memopv2f64>;
4602 //===---------------------------------------------------------------------===//
4603 // SSSE3 - Packed Absolute Instructions
4604 //===---------------------------------------------------------------------===//
4606 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4607 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4608 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4609 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4611 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4612 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4615 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4617 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4619 (vt (OpNode (ld_frag addr:$src))))]>,
4620 Sched<[sched.XMM.Folded]>;
4623 /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4624 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4625 SDNode OpNode, X86SchedWriteWidths sched> {
4626 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4628 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4629 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4632 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4634 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4636 (vt (OpNode (load addr:$src))))]>,
4637 Sched<[sched.YMM.Folded]>;
4640 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4641 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4642 load>, VEX, VEX_WIG;
4643 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4644 load>, VEX, VEX_WIG;
4646 let Predicates = [HasAVX, NoVLX] in {
4647 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4648 load>, VEX, VEX_WIG;
4650 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4651 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4652 VEX, VEX_L, VEX_WIG;
4653 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4654 VEX, VEX_L, VEX_WIG;
4656 let Predicates = [HasAVX2, NoVLX] in {
4657 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4658 VEX, VEX_L, VEX_WIG;
4661 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4663 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4665 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4668 //===---------------------------------------------------------------------===//
4669 // SSSE3 - Packed Binary Operator Instructions
4670 //===---------------------------------------------------------------------===//
4672 /// SS3I_binop_rm - Simple SSSE3 bin op
4673 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4674 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4675 PatFrag memop_frag, X86MemOperand x86memop,
4676 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4677 let isCommutable = 1 in
4678 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4679 (ins RC:$src1, RC:$src2),
4681 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4682 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4683 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4685 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4686 (ins RC:$src1, x86memop:$src2),
4688 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4689 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4691 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4692 Sched<[sched.Folded, sched.ReadAfterFold]>;
4695 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
4696 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4697 Intrinsic IntId128, X86FoldableSchedWrite sched,
4698 PatFrag ld_frag, bit Is2Addr = 1> {
4699 let isCommutable = 1 in
4700 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4701 (ins VR128:$src1, VR128:$src2),
4703 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4704 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4705 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4707 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4708 (ins VR128:$src1, i128mem:$src2),
4710 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4711 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4713 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4714 Sched<[sched.Folded, sched.ReadAfterFold]>;
4717 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4719 X86FoldableSchedWrite sched> {
4720 let isCommutable = 1 in
4721 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4722 (ins VR256:$src1, VR256:$src2),
4723 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4724 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4726 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4727 (ins VR256:$src1, i256mem:$src2),
4728 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4730 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4731 Sched<[sched.Folded, sched.ReadAfterFold]>;
4734 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4735 let isCommutable = 0 in {
4736 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4737 VR128, load, i128mem,
4738 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4739 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4740 v16i8, VR128, load, i128mem,
4741 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4743 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4744 VR128, load, i128mem,
4745 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4748 let ImmT = NoImm, Predicates = [HasAVX] in {
4749 let isCommutable = 0 in {
4750 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4752 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4753 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4755 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4756 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4758 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4759 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4761 SchedWritePHAdd.XMM, 0>, VEX_4V;
4762 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4763 int_x86_ssse3_psign_b_128,
4764 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4765 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4766 int_x86_ssse3_psign_w_128,
4767 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4768 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4769 int_x86_ssse3_psign_d_128,
4770 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4771 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4772 int_x86_ssse3_phadd_sw_128,
4773 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4774 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4775 int_x86_ssse3_phsub_sw_128,
4776 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4780 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4781 let isCommutable = 0 in {
4782 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4783 VR256, load, i256mem,
4784 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4785 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4786 v32i8, VR256, load, i256mem,
4787 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4789 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4790 VR256, load, i256mem,
4791 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4794 let ImmT = NoImm, Predicates = [HasAVX2] in {
4795 let isCommutable = 0 in {
4796 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4797 VR256, load, i256mem,
4798 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4799 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4801 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4802 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4803 VR256, load, i256mem,
4804 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4805 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4807 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
4808 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4809 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4810 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4811 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4812 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4813 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4814 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4815 int_x86_avx2_phadd_sw,
4816 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4817 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4818 int_x86_avx2_phsub_sw,
4819 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4823 // None of these have i8 immediate fields.
4824 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4825 let isCommutable = 0 in {
4826 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4827 memop, i128mem, SchedWritePHAdd.XMM>;
4828 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4829 memop, i128mem, SchedWritePHAdd.XMM>;
4830 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4831 memop, i128mem, SchedWritePHAdd.XMM>;
4832 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4833 memop, i128mem, SchedWritePHAdd.XMM>;
4834 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4835 SchedWriteVecALU.XMM, memop>;
4836 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4837 SchedWriteVecALU.XMM, memop>;
4838 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4839 SchedWriteVecALU.XMM, memop>;
4840 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4841 memop, i128mem, SchedWriteVarShuffle.XMM>;
4842 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4843 int_x86_ssse3_phadd_sw_128,
4844 SchedWritePHAdd.XMM, memop>;
4845 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4846 int_x86_ssse3_phsub_sw_128,
4847 SchedWritePHAdd.XMM, memop>;
4848 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4849 v16i8, VR128, memop, i128mem,
4850 SchedWriteVecIMul.XMM>;
4852 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4853 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4856 //===---------------------------------------------------------------------===//
4857 // SSSE3 - Packed Align Instruction Patterns
4858 //===---------------------------------------------------------------------===//
4860 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4861 PatFrag memop_frag, X86MemOperand x86memop,
4862 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4863 let hasSideEffects = 0 in {
4864 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4865 (ins RC:$src1, RC:$src2, u8imm:$src3),
4867 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4869 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4870 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4873 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4874 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4876 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4878 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4879 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4880 (memop_frag addr:$src2),
4881 (i8 timm:$src3))))]>,
4882 Sched<[sched.Folded, sched.ReadAfterFold]>;
4886 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4887 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4888 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4889 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4890 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4891 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4892 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4893 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4894 SchedWriteShuffle.XMM>;
4896 //===---------------------------------------------------------------------===//
4897 // SSE3 - Thread synchronization
4898 //===---------------------------------------------------------------------===//
4900 let SchedRW = [WriteSystem] in {
4901 let Uses = [EAX, ECX, EDX] in
4902 def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4903 TB, Requires<[HasSSE3, Not64BitMode]>;
4904 let Uses = [RAX, ECX, EDX] in
4905 def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4906 TB, Requires<[HasSSE3, In64BitMode]>;
4908 let Uses = [ECX, EAX] in
4909 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
4910 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4913 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4914 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4916 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4917 Requires<[Not64BitMode]>;
4918 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4919 Requires<[In64BitMode]>;
4921 //===----------------------------------------------------------------------===//
4922 // SSE4.1 - Packed Move with Sign/Zero Extend
4923 // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4924 //===----------------------------------------------------------------------===//
4926 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4927 RegisterClass OutRC, RegisterClass InRC,
4928 X86FoldableSchedWrite sched> {
4929 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4930 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4933 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4934 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4935 Sched<[sched.Folded]>;
4938 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4939 X86MemOperand MemOp, X86MemOperand MemYOp,
4941 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4942 SchedWriteShuffle.XMM>;
4943 let Predicates = [HasAVX, prd] in
4944 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4945 VR128, VR128, SchedWriteShuffle.XMM>,
4947 let Predicates = [HasAVX2, prd] in
4948 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4949 VR256, VR128, WriteShuffle256>,
4950 VEX, VEX_L, VEX_WIG;
4953 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4954 X86MemOperand MemYOp, Predicate prd> {
4955 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4956 MemOp, MemYOp, prd>;
4957 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4958 !strconcat("pmovzx", OpcodeStr),
4959 MemOp, MemYOp, prd>;
4962 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4963 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4964 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4966 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4967 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4969 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
4972 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4973 SDNode ExtOp, SDNode InVecOp> {
4974 // Register-Register patterns
4975 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4976 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4977 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4979 let Predicates = [HasAVX2, NoVLX] in {
4980 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4981 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4982 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4983 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4985 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4986 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4987 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4988 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4990 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4991 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4994 // Simple Register-Memory patterns
4995 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4996 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4997 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4999 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5000 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5003 let Predicates = [HasAVX2, NoVLX] in {
5004 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5005 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5006 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5007 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5009 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5010 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5011 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5012 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5014 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5015 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5018 // AVX2 Register-Memory patterns
5019 let Predicates = [HasAVX2, NoVLX] in {
5020 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5021 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5023 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5024 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5025 def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
5026 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5028 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5029 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5031 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5032 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5033 def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
5034 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5036 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5037 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5038 def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
5039 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5043 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5044 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5046 // SSE4.1/AVX patterns.
5047 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5049 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5050 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5051 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5053 let Predicates = [HasAVX, NoVLX] in {
5054 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5055 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5056 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5057 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5059 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5060 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5061 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5062 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5064 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5065 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5067 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5068 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5069 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5071 let Predicates = [HasAVX, NoVLX] in {
5072 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5073 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5074 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5075 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5077 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5078 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5079 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5080 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5082 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5083 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5085 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5086 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5087 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5088 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5089 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5090 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5091 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5092 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5093 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5095 let Predicates = [HasAVX, NoVLX] in {
5096 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5097 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5098 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5099 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5100 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5101 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5103 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5104 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5105 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5106 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5108 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5109 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5110 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5111 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5112 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5113 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5114 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5115 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5117 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5118 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5119 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5120 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5121 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5122 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5124 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5125 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5126 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5127 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5128 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5129 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5130 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5131 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5135 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5136 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5138 let Predicates = [UseSSE41] in {
5139 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5140 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5143 //===----------------------------------------------------------------------===//
5144 // SSE4.1 - Extract Instructions
5145 //===----------------------------------------------------------------------===//
5147 /// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5148 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5149 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5150 (ins VR128:$src1, u8imm:$src2),
5151 !strconcat(OpcodeStr,
5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5153 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5155 Sched<[WriteVecExtract]>;
5156 let hasSideEffects = 0, mayStore = 1 in
5157 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5158 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5159 !strconcat(OpcodeStr,
5160 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5161 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5162 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5165 let Predicates = [HasAVX, NoBWI] in
5166 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5168 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5171 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5172 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5173 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5174 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5175 (ins VR128:$src1, u8imm:$src2),
5176 !strconcat(OpcodeStr,
5177 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5178 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5180 let hasSideEffects = 0, mayStore = 1 in
5181 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5182 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5183 !strconcat(OpcodeStr,
5184 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5185 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5186 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5189 let Predicates = [HasAVX, NoBWI] in
5190 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5192 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5195 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5196 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5197 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5198 (ins VR128:$src1, u8imm:$src2),
5199 !strconcat(OpcodeStr,
5200 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5202 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5203 Sched<[WriteVecExtract]>;
5204 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5205 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5206 !strconcat(OpcodeStr,
5207 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5208 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5209 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5212 let Predicates = [HasAVX, NoDQI] in
5213 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5215 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5217 /// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
5218 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5219 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5220 (ins VR128:$src1, u8imm:$src2),
5221 !strconcat(OpcodeStr,
5222 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5224 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5225 Sched<[WriteVecExtract]>;
5226 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5227 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5228 !strconcat(OpcodeStr,
5229 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5230 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5231 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5234 let Predicates = [HasAVX, NoDQI] in
5235 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5237 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5239 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5241 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5242 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5243 (ins VR128:$src1, u8imm:$src2),
5244 !strconcat(OpcodeStr,
5245 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5246 [(set GR32orGR64:$dst,
5247 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5248 Sched<[WriteVecExtract]>;
5249 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5250 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5251 !strconcat(OpcodeStr,
5252 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5253 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5254 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5257 let ExeDomain = SSEPackedSingle in {
5258 let Predicates = [UseAVX] in
5259 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5260 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5263 //===----------------------------------------------------------------------===//
5264 // SSE4.1 - Insert Instructions
5265 //===----------------------------------------------------------------------===//
5267 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5268 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5269 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5271 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5273 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5275 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5276 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5277 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5278 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5280 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5282 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5284 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5285 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5288 let Predicates = [HasAVX, NoBWI] in
5289 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5290 let Constraints = "$src1 = $dst" in
5291 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
// SS41I_insert32 - PINSRD: insert a 32-bit GPR (rr) or a loaded i32 (rm)
// into the v4i32 element selected by imm8, matched via the generic
// insertelt node rather than a target-specific one.
5293 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5294 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5295 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5297 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5299 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5301 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5302 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5303 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5304 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5306 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5308 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5310 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5311 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// PINSRD instantiations: VEX form gated on [HasAVX, NoDQI] (AVX-512 DQ
// provides the EVEX replacement); legacy form is 2-address.
5314 let Predicates = [HasAVX, NoDQI] in
5315 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5316 let Constraints = "$src1 = $dst" in
5317 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
// SS41I_insert64 - PINSRQ: 64-bit variant of the insert above; inserts a
// GR64 (rr) or loaded i64 (rm) into the v2i64 element selected by imm8.
// Same opcode as PINSRD (0x22); distinguished by REX.W/VEX.W at def sites.
5319 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5320 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5321 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5323 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5325 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5327 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5328 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5329 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5330 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5332 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5334 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5336 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5337 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// PINSRQ instantiations: VEX form carries VEX_W, legacy form carries REX_W
// (both 64-bit-operand encodings of opcode 0x22). Gated on NoDQI like PINSRD.
5340 let Predicates = [HasAVX, NoDQI] in
5341 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5342 let Constraints = "$src1 = $dst" in
5343 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5345 // insertps has a few different modes. The first two below are optimized
5346 // inserts that won't zero arbitrary elements in the destination vector.
5347 // The next one matches the intrinsic and could zero arbitrary elements
5348 // in the target vector.
// SS41I_insertf32 - INSERTPS: insert a single f32 element under imm8
// control, matched via the X86insertps node. rr is marked commutable
// (the immediate can be rewritten to swap sources); rm wraps the loaded
// scalar in scalar_to_vector before feeding X86insertps.
5349 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5350 let isCommutable = 1 in
5351 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5352 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5354 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5356 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5358 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5359 Sched<[SchedWriteFShuffle.XMM]>;
5360 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5361 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5363 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5365 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5367 (X86insertps VR128:$src1,
5368 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5370 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
// INSERTPS instantiations: VEX form under UseAVX, legacy 2-address form
// otherwise; both live in the packed-single execution domain.
5373 let ExeDomain = SSEPackedSingle in {
5374 let Predicates = [UseAVX] in
5375 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5377 let Constraints = "$src1 = $dst" in
5378 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5381 //===----------------------------------------------------------------------===//
5382 // SSE4.1 - Round Instructions
5383 //===----------------------------------------------------------------------===//
// sse41_fp_unop_p - packed FP rounding (roundps/roundpd and VEX variants).
// r applies OpNode to a register source with the imm8 rounding-mode control;
// m folds a vector load. Reads MXCSR and may raise FP exceptions, hence the
// Uses/mayRaiseFPException wrapper.
5385 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5386 X86MemOperand x86memop, RegisterClass RC,
5387 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5388 X86FoldableSchedWrite sched> {
5389 // Intrinsic operation, reg.
5390 // Vector intrinsic operation, reg
5391 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5392 def r : SS4AIi8<opc, MRMSrcReg,
5393 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5394 !strconcat(OpcodeStr,
5395 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5396 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5399 // Vector intrinsic operation, mem
5400 def m : SS4AIi8<opc, MRMSrcMem,
5401 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5402 !strconcat(OpcodeStr,
5403 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5405 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5406 Sched<[sched.Folded]>;
// avx_fp_unop_rm - assembler-only (empty pattern lists, hasSideEffects = 0,
// isCodeGenOnly = 1) 3-operand scalar round forms: vroundss/vroundsd over
// FR32/FR64 with reg (SSr/SDr) and folded-load (SSm/SDm) encodings.
// Selection happens through separate Pat<> entries, not these defs.
5410 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5411 string OpcodeStr, X86FoldableSchedWrite sched> {
5412 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5413 def SSr : SS4AIi8<opcss, MRMSrcReg,
5414 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5415 !strconcat(OpcodeStr,
5416 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5417 []>, Sched<[sched]>;
5420 def SSm : SS4AIi8<opcss, MRMSrcMem,
5421 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5422 !strconcat(OpcodeStr,
5423 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5424 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5425 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5427 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5428 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5429 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5430 !strconcat(OpcodeStr,
5431 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5432 []>, Sched<[sched]>;
5435 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5436 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5437 !strconcat(OpcodeStr,
5438 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5439 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5440 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_unop_s - legacy (non-VEX) 2-operand scalar round forms:
// roundss/roundsd over FR32/FR64. Like avx_fp_unop_rm these are
// assembler/codegen-only shells with empty patterns; Pat<> entries below
// select into them. Wrapped in Uses = [MXCSR]/mayRaiseFPException.
5443 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5444 string OpcodeStr, X86FoldableSchedWrite sched> {
5445 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5446 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5447 def SSr : SS4AIi8<opcss, MRMSrcReg,
5448 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5449 !strconcat(OpcodeStr,
5450 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5451 []>, Sched<[sched]>;
5454 def SSm : SS4AIi8<opcss, MRMSrcMem,
5455 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5456 !strconcat(OpcodeStr,
5457 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5458 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5459 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5461 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5462 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5463 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5464 !strconcat(OpcodeStr,
5465 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5466 []>, Sched<[sched]>;
5469 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5470 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5471 !strconcat(OpcodeStr,
5472 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5473 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5474 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_binop_s - intrinsic (_Int) forms of roundss/roundsd operating on
// full VR128 values: result element 0 is rounded from $src2, upper elements
// pass through from $src1. Register and scalar-load (ssmem/sdmem with
// sse_load_f32/f64) encodings for both single and double.
5478 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5479 string OpcodeStr, X86FoldableSchedWrite sched,
5480 ValueType VT32, ValueType VT64,
5481 SDNode OpNode, bit Is2Addr = 1> {
5482 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5483 let ExeDomain = SSEPackedSingle in {
5484 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5485 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5487 !strconcat(OpcodeStr,
5488 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5489 !strconcat(OpcodeStr,
5490 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5491 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5494 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5495 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5497 !strconcat(OpcodeStr,
5498 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5499 !strconcat(OpcodeStr,
5500 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5502 (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
5503 Sched<[sched.Folded, sched.ReadAfterFold]>;
5504 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5506 let ExeDomain = SSEPackedDouble in {
5507 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5508 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5510 !strconcat(OpcodeStr,
5511 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5512 !strconcat(OpcodeStr,
5513 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5514 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5517 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5518 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5520 !strconcat(OpcodeStr,
5521 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5522 !strconcat(OpcodeStr,
5523 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5525 (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
5526 Sched<[sched.Folded, sched.ReadAfterFold]>;
5527 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5531 // FP round - roundss, roundps, roundsd, roundpd
// AVX packed rounding (vroundps/vroundpd, XMM and YMM) selected via
// X86any_VRndScale, plus the AVX scalar round instantiations: the _Int
// intrinsic forms (sse41_fp_binop_s) and the assembler-only FR32/FR64
// forms (avx_fp_unop_rm) share the VROUND prefix.
5532 let Predicates = [HasAVX, NoVLX] in {
5533 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5535 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5536 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5538 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5539 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5540 VEX, VEX_L, VEX_WIG;
5543 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5544 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5545 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5547 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5548 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5549 VEX, VEX_L, VEX_WIG;
5552 let Predicates = [UseAVX] in {
5553 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5554 v4f32, v2f64, X86RndScales, 0>,
5555 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5556 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5557 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
// Selection patterns for the pattern-less VROUNDSS/SDr(m) shells: feed an
// IMPLICIT_DEF as the pass-through first source. The folded-load forms are
// restricted to OptForSize, since the load ties the upper elements to the
// implicit-def register anyway.
5560 let Predicates = [UseAVX] in {
5561 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5562 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5563 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5564 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5567 let Predicates = [UseAVX, OptForSize] in {
5568 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5569 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5570 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5571 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
// Legacy SSE4.1 round instructions: packed (roundps/roundpd over memop
// loads), assembler-only scalar shells (sse41_fp_unop_s), and the tied
// 2-address intrinsic forms (sse41_fp_binop_s).
5574 let ExeDomain = SSEPackedSingle in
5575 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5576 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5577 let ExeDomain = SSEPackedDouble in
5578 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5579 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5581 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5583 let Constraints = "$src1 = $dst" in
5584 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5585 v4f32, v2f64, X86RndScales>;
// SSE4.1 counterparts of the UseAVX patterns above: select scalar rounds
// into the pattern-less ROUNDSS/SDr(m) shells; the 2-operand legacy forms
// need no IMPLICIT_DEF pass-through operand. Load folding only OptForSize.
5587 let Predicates = [UseSSE41] in {
5588 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5589 (ROUNDSSr FR32:$src1, timm:$src2)>;
5590 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5591 (ROUNDSDr FR64:$src1, timm:$src2)>;
5594 let Predicates = [UseSSE41, OptForSize] in {
5595 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5596 (ROUNDSSm addr:$src1, timm:$src2)>;
5597 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5598 (ROUNDSDm addr:$src1, timm:$src2)>;
5601 //===----------------------------------------------------------------------===//
5602 // SSE4.1 - Packed Bit Test
5603 //===----------------------------------------------------------------------===//
5605 // ptest instruction we'll lower to this in X86ISelLowering primarily from
5606 // the intel intrinsic that corresponds to this.
// PTEST / VPTEST: set EFLAGS from an AND/ANDN test of two vectors via the
// X86ptest node; no register result (outs is empty, Defs = [EFLAGS]).
// VEX forms cover XMM and YMM with reg and folded-load encodings; the
// legacy SSE4.1 form is XMM-only and uses aligned memopv2i64.
5607 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5608 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5609 "vptest\t{$src2, $src1|$src1, $src2}",
5610 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5611 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5612 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5613 "vptest\t{$src2, $src1|$src1, $src2}",
5614 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5615 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5618 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5619 "vptest\t{$src2, $src1|$src1, $src2}",
5620 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5621 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5622 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5623 "vptest\t{$src2, $src1|$src1, $src2}",
5624 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5625 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5626 VEX, VEX_L, VEX_WIG;
5629 let Defs = [EFLAGS] in {
5630 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5631 "ptest\t{$src2, $src1|$src1, $src2}",
5632 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5633 Sched<[SchedWriteVecTest.XMM]>;
5634 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5635 "ptest\t{$src2, $src1|$src1, $src2}",
5636 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5637 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5640 // The bit test instructions below are AVX only
// avx_bittest - parameterized VTESTPS/VTESTPD: EFLAGS-setting FP bit test
// via the X86testp node, register and folded-load forms, always VEX-encoded
// (these instructions have no legacy SSE counterpart).
5641 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5642 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5643 X86FoldableSchedWrite sched> {
5644 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5645 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5646 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5647 Sched<[sched]>, VEX;
5648 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5649 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5650 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5651 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
// VTESTPS/VTESTPD instantiations: XMM and YMM widths for both the
// single- and double-precision execution domains. All define EFLAGS.
5654 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5655 let ExeDomain = SSEPackedSingle in {
5656 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5657 SchedWriteFTest.XMM>;
5658 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5659 SchedWriteFTest.YMM>, VEX_L;
5661 let ExeDomain = SSEPackedDouble in {
5662 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5663 SchedWriteFTest.XMM>;
5664 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5665 SchedWriteFTest.YMM>, VEX_L;
5669 //===----------------------------------------------------------------------===//
5670 // SSE4.1 - Misc Instructions
5671 //===----------------------------------------------------------------------===//
// POPCNT: population count, 16/32/64-bit GPR widths, each with register and
// memory-source forms. Matched from the generic ctpop node; every form also
// clobbers EFLAGS (implicit EFLAGS in the pattern, Defs = [EFLAGS]).
// Gated on HasPOPCNT, a separate CPUID bit from SSE4.1.
5673 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5674 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5675 "popcnt{w}\t{$src, $dst|$dst, $src}",
5676 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5677 Sched<[WritePOPCNT]>, OpSize16, XS;
5678 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5679 "popcnt{w}\t{$src, $dst|$dst, $src}",
5680 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5681 (implicit EFLAGS)]>,
5682 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5684 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5685 "popcnt{l}\t{$src, $dst|$dst, $src}",
5686 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5687 Sched<[WritePOPCNT]>, OpSize32, XS;
5689 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5690 "popcnt{l}\t{$src, $dst|$dst, $src}",
5691 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5692 (implicit EFLAGS)]>,
5693 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5695 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5696 "popcnt{q}\t{$src, $dst|$dst, $src}",
5697 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5698 Sched<[WritePOPCNT]>, XS;
5699 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5700 "popcnt{q}\t{$src, $dst|$dst, $src}",
5701 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5702 (implicit EFLAGS)]>,
5703 Sched<[WritePOPCNT.Folded]>, XS;
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Register and folded-load forms of a v8i16 -> v8i16 unary op (used for
// PHMINPOSUW below); OpNode/ld_frag/Sched are all injected by the caller.
5707 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5708 SDNode OpNode, PatFrag ld_frag,
5709 X86FoldableSchedWrite Sched> {
5710 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5712 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5713 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5715 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5717 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5719 (v8i16 (OpNode (ld_frag addr:$src))))]>,
5720 Sched<[Sched.Folded]>;
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
// PHMINPOSUW: VEX form under HasAVX, legacy SSE4.1 form otherwise.
5725 let Predicates = [HasAVX] in
5726 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5728 WritePHMINPOS>, VEX, VEX_WIG;
5729 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
/// SS48I_binop_rm - Simple SSE41 binary operator.
// Commutable rr form plus rm form folding a load into the second operand;
// parameterized over value type, register class, memory fragment, and
// scheduling class. Used below for pmin*/pmax*/pmuldq/pmulld/pcmpeqq.
5734 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5735 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5736 X86MemOperand x86memop, X86FoldableSchedWrite sched,
5738 let isCommutable = 1 in
5739 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5740 (ins RC:$src1, RC:$src2),
5742 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5743 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5744 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5746 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5747 (ins RC:$src1, x86memop:$src2),
5749 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5750 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5752 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5753 Sched<[sched.Folded, sched.ReadAfterFold]>;
// 128-bit AVX integer min/max and pmuldq. The i32/i64 ops are gated on
// NoVLX, the i8/i16 ops on NoVLX_Or_NoBWI, so AVX-512 variants take over
// when the corresponding EVEX features are available.
5756 let Predicates = [HasAVX, NoVLX] in {
5757 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5758 load, i128mem, SchedWriteVecALU.XMM, 0>,
5760 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5761 load, i128mem, SchedWriteVecALU.XMM, 0>,
5763 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5764 load, i128mem, SchedWriteVecALU.XMM, 0>,
5766 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5767 load, i128mem, SchedWriteVecALU.XMM, 0>,
5769 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5770 load, i128mem, SchedWriteVecIMul.XMM, 0>,
5773 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5774 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5775 load, i128mem, SchedWriteVecALU.XMM, 0>,
5777 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5778 load, i128mem, SchedWriteVecALU.XMM, 0>,
5780 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5781 load, i128mem, SchedWriteVecALU.XMM, 0>,
5783 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5784 load, i128mem, SchedWriteVecALU.XMM, 0>,
// 256-bit AVX2 versions of the same min/max and pmuldq set, with the same
// NoVLX / NoVLX_Or_NoBWI gating scheme as the 128-bit defs above.
5788 let Predicates = [HasAVX2, NoVLX] in {
5789 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5790 load, i256mem, SchedWriteVecALU.YMM, 0>,
5791 VEX_4V, VEX_L, VEX_WIG;
5792 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5793 load, i256mem, SchedWriteVecALU.YMM, 0>,
5794 VEX_4V, VEX_L, VEX_WIG;
5795 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5796 load, i256mem, SchedWriteVecALU.YMM, 0>,
5797 VEX_4V, VEX_L, VEX_WIG;
5798 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5799 load, i256mem, SchedWriteVecALU.YMM, 0>,
5800 VEX_4V, VEX_L, VEX_WIG;
5801 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5802 load, i256mem, SchedWriteVecIMul.YMM, 0>,
5803 VEX_4V, VEX_L, VEX_WIG;
5805 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5806 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5807 load, i256mem, SchedWriteVecALU.YMM, 0>,
5808 VEX_4V, VEX_L, VEX_WIG;
5809 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5810 load, i256mem, SchedWriteVecALU.YMM, 0>,
5811 VEX_4V, VEX_L, VEX_WIG;
5812 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5813 load, i256mem, SchedWriteVecALU.YMM, 0>,
5814 VEX_4V, VEX_L, VEX_WIG;
5815 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5816 load, i256mem, SchedWriteVecALU.YMM, 0>,
5817 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 min/max and pmuldq: 2-address ($src1 tied to $dst), aligned
// memop loads, Is2Addr = 1 selects the 2-operand asm string.
5820 let Constraints = "$src1 = $dst" in {
5821 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5822 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5823 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5824 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5825 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5826 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5827 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5828 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5829 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5830 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5831 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5832 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5833 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5834 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5835 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5836 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5837 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5838 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
// PMULLD (mul) and PCMPEQQ (X86pcmpeq) in AVX 128-bit, AVX2 256-bit, and
// legacy 2-address SSE4.1 forms. PMULLD is NoVLX-gated (EVEX replaces it);
// PCMPEQQ only needs HasAVX/HasAVX2.
5841 let Predicates = [HasAVX, NoVLX] in
5842 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5843 load, i128mem, SchedWritePMULLD.XMM, 0>,
5845 let Predicates = [HasAVX] in
5846 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5847 load, i128mem, SchedWriteVecALU.XMM, 0>,
5850 let Predicates = [HasAVX2, NoVLX] in
5851 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5852 load, i256mem, SchedWritePMULLD.YMM, 0>,
5853 VEX_4V, VEX_L, VEX_WIG;
5854 let Predicates = [HasAVX2] in
5855 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5856 load, i256mem, SchedWriteVecALU.YMM, 0>,
5857 VEX_4V, VEX_L, VEX_WIG;
5859 let Constraints = "$src1 = $dst" in {
5860 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5861 memop, i128mem, SchedWritePMULLD.XMM, 1>;
5862 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5863 memop, i128mem, SchedWriteVecALU.XMM, 1>;
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// Intrinsic-matched variant: the pattern calls a compiler intrinsic (IntId)
// directly rather than an SDNode. rri is commutable; rmi folds a load into
// the second source. Used for mpsadbw/dpps/dppd below.
5867 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5868 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5869 X86MemOperand x86memop, bit Is2Addr,
5870 X86FoldableSchedWrite sched> {
5871 let isCommutable = 1 in
5872 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5873 (ins RC:$src1, RC:$src2, u8imm:$src3),
5875 !strconcat(OpcodeStr,
5876 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5877 !strconcat(OpcodeStr,
5878 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5879 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5881 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5882 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5884 !strconcat(OpcodeStr,
5885 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5886 !strconcat(OpcodeStr,
5887 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5889 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5890 Sched<[sched.Folded, sched.ReadAfterFold]>;
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
// SDNode-matched twin of SS41I_binop_rmi_int: identical encodings/asm, but
// the pattern matches (OpVT (OpNode ...)) instead of an intrinsic call.
5894 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5895 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5896 X86MemOperand x86memop, bit Is2Addr,
5897 X86FoldableSchedWrite sched> {
5898 let isCommutable = 1 in
5899 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5900 (ins RC:$src1, RC:$src2, u8imm:$src3),
5902 !strconcat(OpcodeStr,
5903 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5904 !strconcat(OpcodeStr,
5905 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5906 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5908 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5909 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5911 !strconcat(OpcodeStr,
5912 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5913 !strconcat(OpcodeStr,
5914 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5916 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5917 Sched<[sched.Folded, sched.ReadAfterFold]>;
// BlendCommuteImmN: invert an N-bit blend mask (XOR with the all-ones mask
// of that width) so a blend with swapped sources selects the same elements.
// Used by the commuted-load patterns emitted from SS41I_blend_rmi.
5920 def BlendCommuteImm2 : SDNodeXForm<timm, [{
5921 uint8_t Imm = N->getZExtValue() & 0x03;
5922 return getI8Imm(Imm ^ 0x03, SDLoc(N));
5925 def BlendCommuteImm4 : SDNodeXForm<timm, [{
5926 uint8_t Imm = N->getZExtValue() & 0x0f;
5927 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5930 def BlendCommuteImm8 : SDNodeXForm<timm, [{
5931 uint8_t Imm = N->getZExtValue() & 0xff;
5932 return getI8Imm(Imm ^ 0xff, SDLoc(N));
// BlendScaleImmX(toY) / BlendScaleCommuteImmX: widen a 2- or 4-bit blendi
// immediate to the element granularity of pblendw (8-bit) or pblendd
// (4-bit) by replicating each source bit across the wider field; the
// Commute variants additionally invert the result (XOR all-ones) for
// swapped-source patterns.
5935 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
5936 def BlendScaleImm4 : SDNodeXForm<timm, [{
5937 uint8_t Imm = N->getZExtValue();
5939 for (unsigned i = 0; i != 4; ++i) {
5941 NewImm |= 0x3 << (i * 2);
5943 return getI8Imm(NewImm, SDLoc(N));
5946 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
5947 def BlendScaleImm2 : SDNodeXForm<timm, [{
5948 uint8_t Imm = N->getZExtValue();
5950 for (unsigned i = 0; i != 2; ++i) {
5952 NewImm |= 0xf << (i * 4);
5954 return getI8Imm(NewImm, SDLoc(N));
5957 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
5958 def BlendScaleImm2to4 : SDNodeXForm<timm, [{
5959 uint8_t Imm = N->getZExtValue();
5961 for (unsigned i = 0; i != 2; ++i) {
5963 NewImm |= 0x3 << (i * 2);
5965 return getI8Imm(NewImm, SDLoc(N));
5968 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
5969 def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
5970 uint8_t Imm = N->getZExtValue();
5972 for (unsigned i = 0; i != 4; ++i) {
5974 NewImm |= 0x3 << (i * 2);
5976 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5979 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
5980 def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
5981 uint8_t Imm = N->getZExtValue();
5983 for (unsigned i = 0; i != 2; ++i) {
5985 NewImm |= 0xf << (i * 4);
5987 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5990 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
5991 def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
5992 uint8_t Imm = N->getZExtValue();
5994 for (unsigned i = 0; i != 2; ++i) {
5996 NewImm |= 0x3 << (i * 2);
5998 return getI8Imm(NewImm ^ 0xf, SDLoc(N));
// AVX/AVX2 mpsadbw and dot-product instructions, via the intrinsic-matched
// multiclass. isCommutable = 0 overrides the multiclass default for
// mpsadbw (its immediate encodes per-source offsets, so sources cannot be
// swapped). The dp* forms read MXCSR and may raise FP exceptions.
6001 let Predicates = [HasAVX] in {
6002 let isCommutable = 0 in {
6003 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6004 VR128, load, i128mem, 0,
6005 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6008 let Uses = [MXCSR], mayRaiseFPException = 1 in {
6009 let ExeDomain = SSEPackedSingle in
6010 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6011 VR128, load, f128mem, 0,
6012 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6013 let ExeDomain = SSEPackedDouble in
6014 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6015 VR128, load, f128mem, 0,
6016 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
6017 let ExeDomain = SSEPackedSingle in
6018 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6019 VR256, load, i256mem, 0,
6020 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6024 let Predicates = [HasAVX2] in {
6025 let isCommutable = 0 in {
6026 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6027 VR256, load, i256mem, 0,
6028 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 mpsadbw/dpps/dppd: 2-address, aligned memop loads. mpsadbw
// again marked non-commutable; the dp* forms carry SIMD_EXC.
6032 let Constraints = "$src1 = $dst" in {
6033 let isCommutable = 0 in {
6034 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6035 VR128, memop, i128mem, 1,
6036 SchedWriteMPSAD.XMM>;
6039 let ExeDomain = SSEPackedSingle in
6040 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6041 VR128, memop, f128mem, 1,
6042 SchedWriteDPPS.XMM>, SIMD_EXC;
6043 let ExeDomain = SSEPackedDouble in
6044 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6045 VR128, memop, f128mem, 1,
6046 SchedWriteDPPD.XMM>, SIMD_EXC;
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
// Register and folded-load blendi forms plus an extra Pat<> that handles a
// load in the FIRST source by commuting to the rmi form with the immediate
// rewritten through commuteXForm (one of the BlendCommuteImm* defs above).
6050 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6051 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6052 X86MemOperand x86memop, bit Is2Addr, Domain d,
6053 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6054 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6055 let isCommutable = 1 in
6056 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6057 (ins RC:$src1, RC:$src2, u8imm:$src3),
6059 !strconcat(OpcodeStr,
6060 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6061 !strconcat(OpcodeStr,
6062 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6063 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6065 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6066 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6068 !strconcat(OpcodeStr,
6069 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6070 !strconcat(OpcodeStr,
6071 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6073 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6074 Sched<[sched.Folded, sched.ReadAfterFold]>;
6077 // Pattern to commute if load is in first source.
6078 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6079 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6080 (commuteXForm timm:$src3))>;
// AVX/AVX2 FP and integer blends. Each instantiation pairs its element
// count with the matching commute transform (2 lanes -> BlendCommuteImm2,
// 4 -> Imm4, 8 -> Imm8). VPBLENDWY is the only AVX2-gated one here.
6083 let Predicates = [HasAVX] in {
6084 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6085 VR128, load, f128mem, 0, SSEPackedSingle,
6086 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6088 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6089 VR256, load, f256mem, 0, SSEPackedSingle,
6090 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6091 VEX_4V, VEX_L, VEX_WIG;
6092 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6093 VR128, load, f128mem, 0, SSEPackedDouble,
6094 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6096 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6097 VR256, load, f256mem, 0, SSEPackedDouble,
6098 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6099 VEX_4V, VEX_L, VEX_WIG;
6100 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6101 VR128, load, i128mem, 0, SSEPackedInt,
6102 SchedWriteBlend.XMM, BlendCommuteImm8>,
6106 let Predicates = [HasAVX2] in {
6107 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6108 VR256, load, i256mem, 0, SSEPackedInt,
6109 SchedWriteBlend.YMM, BlendCommuteImm8>,
6110 VEX_4V, VEX_L, VEX_WIG;
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
// AVX1-only: there is no 256-bit integer blend, so i32/i64 blends map to
// the FP blendps/blendpd forms (256-bit) or to pblendw with a scaled
// immediate (128-bit). Loads in the first source use the Commute xforms.
6115 let Predicates = [HasAVX1Only] in {
6116 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6117 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6118 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6119 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6120 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6121 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6123 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6124 // it from becoming movsd via commuting under optsize.
6125 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6126 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6127 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6128 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6129 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6130 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6132 def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6133 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6134 def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6135 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6136 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6137 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6139 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6140 // it from becoming movss via commuting under optsize.
6141 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6142 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6143 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6144 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6145 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6146 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6149 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6150 VR128, memop, f128mem, 1, SSEPackedSingle,
6151 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6152 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6153 VR128, memop, f128mem, 1, SSEPackedDouble,
6154 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6155 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6156 VR128, memop, i128mem, 1, SSEPackedInt,
6157 SchedWriteBlend.XMM, BlendCommuteImm8>;
6159 let Predicates = [UseSSE41] in {
6160 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6161 // it from becoming movss via commuting under optsize.
6162 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6163 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6164 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6165 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6166 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6167 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6169 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6170 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6171 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6172 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6173 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6174 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6177 // For insertion into the zero index (low half) of a 256-bit vector, it is
6178 // more efficient to generate a blend with immediate instead of an insert*128.
6179 let Predicates = [HasAVX] in {
6180 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6181 (VBLENDPDYrri VR256:$src1,
6182 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6183 VR128:$src2, sub_xmm), 0x3)>;
6184 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6185 (VBLENDPSYrri VR256:$src1,
6186 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6187 VR128:$src2, sub_xmm), 0xf)>;
6189 def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6190 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6191 VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6192 def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6193 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6194 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6197 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
6198 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6199 X86MemOperand x86memop, ValueType VT,
6200 PatFrag mem_frag, SDNode OpNode,
6201 X86FoldableSchedWrite sched> {
6202 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6203 (ins RC:$src1, RC:$src2, RC:$src3),
6204 !strconcat(OpcodeStr,
6205 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6206 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6207 SSEPackedInt>, TAPD, VEX_4V,
6210 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6211 (ins RC:$src1, x86memop:$src2, RC:$src3),
6212 !strconcat(OpcodeStr,
6213 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6215 (OpNode RC:$src3, (mem_frag addr:$src2),
6216 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6217 Sched<[sched.Folded, sched.ReadAfterFold,
6219 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6222 sched.ReadAfterFold]>;
6225 let Predicates = [HasAVX] in {
6226 let ExeDomain = SSEPackedDouble in {
6227 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6228 v2f64, loadv2f64, X86Blendv,
6229 SchedWriteFVarBlend.XMM>;
6230 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6231 v4f64, loadv4f64, X86Blendv,
6232 SchedWriteFVarBlend.YMM>, VEX_L;
6233 } // ExeDomain = SSEPackedDouble
6234 let ExeDomain = SSEPackedSingle in {
6235 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6236 v4f32, loadv4f32, X86Blendv,
6237 SchedWriteFVarBlend.XMM>;
6238 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6239 v8f32, loadv8f32, X86Blendv,
6240 SchedWriteFVarBlend.YMM>, VEX_L;
6241 } // ExeDomain = SSEPackedSingle
6242 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6243 v16i8, loadv16i8, X86Blendv,
6244 SchedWriteVarBlend.XMM>;
6247 let Predicates = [HasAVX2] in {
6248 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6249 v32i8, loadv32i8, X86Blendv,
6250 SchedWriteVarBlend.YMM>, VEX_L;
6253 let Predicates = [HasAVX] in {
6254 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6255 (v4i32 VR128:$src2))),
6256 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6257 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6258 (v2i64 VR128:$src2))),
6259 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6260 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6261 (v8i32 VR256:$src2))),
6262 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6263 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6264 (v4i64 VR256:$src2))),
6265 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6268 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6269 // changed to use blends because blends have better throughput on sandybridge
6270 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6271 let Predicates = [HasAVX, OptForSpeed] in {
6272 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6273 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6274 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6275 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6277 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6278 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6279 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6280 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6281 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6282 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6284 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6285 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6286 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6287 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6288 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6289 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6291 // Move low f32 and clear high bits.
6292 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6293 (SUBREG_TO_REG (i32 0),
6294 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6295 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6296 (i8 1))), sub_xmm)>;
6297 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6298 (SUBREG_TO_REG (i32 0),
6299 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6300 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6301 (i8 3))), sub_xmm)>;
6304 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6305 // changed to use blends because blends have better throughput on sandybridge
6306 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
6307 let Predicates = [UseSSE41, OptForSpeed] in {
6308 // With SSE41 we can use blends for these patterns.
6309 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6310 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6311 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6312 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6314 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6315 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6316 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6317 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6318 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6319 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6321 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6322 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6323 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6324 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6325 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6326 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6330 /// SS41I_ternary - SSE 4.1 ternary operator
6331 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6332 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6333 PatFrag mem_frag, X86MemOperand x86memop,
6334 SDNode OpNode, X86FoldableSchedWrite sched> {
6335 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6336 (ins VR128:$src1, VR128:$src2),
6337 !strconcat(OpcodeStr,
6338 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6340 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6343 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6344 (ins VR128:$src1, x86memop:$src2),
6345 !strconcat(OpcodeStr,
6346 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6348 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6349 Sched<[sched.Folded, sched.ReadAfterFold]>;
6353 let ExeDomain = SSEPackedDouble in
6354 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6355 X86Blendv, SchedWriteFVarBlend.XMM>;
6356 let ExeDomain = SSEPackedSingle in
6357 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6358 X86Blendv, SchedWriteFVarBlend.XMM>;
6359 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6360 X86Blendv, SchedWriteVarBlend.XMM>;
6362 // Aliases with the implicit xmm0 argument
6363 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6364 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6365 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6366 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6367 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6368 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6369 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6370 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6371 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6372 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6373 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6374 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6376 let Predicates = [UseSSE41] in {
6377 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6378 (v4i32 VR128:$src2))),
6379 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6380 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6381 (v2i64 VR128:$src2))),
6382 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6385 let AddedComplexity = 400 in { // Prefer non-temporal versions
6387 let Predicates = [HasAVX, NoVLX] in
6388 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6389 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6390 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6391 let Predicates = [HasAVX2, NoVLX] in
6392 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6393 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6394 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6395 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6396 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6397 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6399 let Predicates = [HasAVX2, NoVLX] in {
6400 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6401 (VMOVNTDQAYrm addr:$src)>;
6402 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6403 (VMOVNTDQAYrm addr:$src)>;
6404 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6405 (VMOVNTDQAYrm addr:$src)>;
6406 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6407 (VMOVNTDQAYrm addr:$src)>;
6408 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6409 (VMOVNTDQAYrm addr:$src)>;
6410 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6411 (VMOVNTDQAYrm addr:$src)>;
6414 let Predicates = [HasAVX, NoVLX] in {
6415 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6416 (VMOVNTDQArm addr:$src)>;
6417 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6418 (VMOVNTDQArm addr:$src)>;
6419 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6420 (VMOVNTDQArm addr:$src)>;
6421 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6422 (VMOVNTDQArm addr:$src)>;
6423 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6424 (VMOVNTDQArm addr:$src)>;
6425 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6426 (VMOVNTDQArm addr:$src)>;
6429 let Predicates = [UseSSE41] in {
6430 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6431 (MOVNTDQArm addr:$src)>;
6432 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6433 (MOVNTDQArm addr:$src)>;
6434 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6435 (MOVNTDQArm addr:$src)>;
6436 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6437 (MOVNTDQArm addr:$src)>;
6438 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6439 (MOVNTDQArm addr:$src)>;
6440 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6441 (MOVNTDQArm addr:$src)>;
6444 } // AddedComplexity
6446 //===----------------------------------------------------------------------===//
6447 // SSE4.2 - Compare Instructions
6448 //===----------------------------------------------------------------------===//
6450 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
6451 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6452 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6453 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6455 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6456 (ins RC:$src1, RC:$src2),
6458 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6459 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6460 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6462 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6463 (ins RC:$src1, x86memop:$src2),
6465 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6466 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6468 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6469 Sched<[sched.Folded, sched.ReadAfterFold]>;
6472 let Predicates = [HasAVX] in
6473 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6474 load, i128mem, SchedWriteVecALU.XMM, 0>,
6477 let Predicates = [HasAVX2] in
6478 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6479 load, i256mem, SchedWriteVecALU.YMM, 0>,
6480 VEX_4V, VEX_L, VEX_WIG;
6482 let Constraints = "$src1 = $dst" in
6483 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6484 memop, i128mem, SchedWriteVecALU.XMM>;
6486 //===----------------------------------------------------------------------===//
6487 // SSE4.2 - String/text Processing Instructions
6488 //===----------------------------------------------------------------------===//
6490 multiclass pcmpistrm_SS42AI<string asm> {
6491 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6492 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6493 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6494 []>, Sched<[WritePCmpIStrM]>;
6496 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6497 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6498 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6499 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6502 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6503 let Predicates = [HasAVX] in
6504 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6505 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
6508 multiclass SS42AI_pcmpestrm<string asm> {
6509 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6510 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6511 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6512 []>, Sched<[WritePCmpEStrM]>;
6514 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6515 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6516 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6517 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6520 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6521 let Predicates = [HasAVX] in
6522 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6523 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6526 multiclass SS42AI_pcmpistri<string asm> {
6527 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6528 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6529 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6530 []>, Sched<[WritePCmpIStrI]>;
6532 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6533 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6534 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6535 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6538 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6539 let Predicates = [HasAVX] in
6540 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6541 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6544 multiclass SS42AI_pcmpestri<string asm> {
6545 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6546 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6547 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6548 []>, Sched<[WritePCmpEStrI]>;
6550 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6551 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6552 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6553 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6556 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6557 let Predicates = [HasAVX] in
6558 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6559 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6562 //===----------------------------------------------------------------------===//
6563 // SSE4.2 - CRC Instructions
6564 //===----------------------------------------------------------------------===//
6566 // No CRC instructions have AVX equivalents
6568 // crc intrinsic instruction
6569 // This set of instructions are only rm, the only difference is the size
6571 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6572 RegisterClass RCIn, SDPatternOperator Int> :
6573 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6574 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6575 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6576 Sched<[WriteCRC32]>;
6578 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6579 X86MemOperand x86memop, SDPatternOperator Int> :
6580 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6581 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6582 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6583 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6585 let Constraints = "$src1 = $dst" in {
6586 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6587 int_x86_sse42_crc32_32_8>;
6588 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6589 int_x86_sse42_crc32_32_8>;
6590 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6591 int_x86_sse42_crc32_32_16>, OpSize16;
6592 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6593 int_x86_sse42_crc32_32_16>, OpSize16;
6594 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6595 int_x86_sse42_crc32_32_32>, OpSize32;
6596 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6597 int_x86_sse42_crc32_32_32>, OpSize32;
6598 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6599 int_x86_sse42_crc32_64_64>, REX_W;
6600 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6601 int_x86_sse42_crc32_64_64>, REX_W;
6602 let hasSideEffects = 0 in {
6604 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6606 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6611 //===----------------------------------------------------------------------===//
6612 // SHA-NI Instructions
6613 //===----------------------------------------------------------------------===//
6615 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6616 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6617 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6618 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6619 (ins VR128:$src1, VR128:$src2),
6621 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6622 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6624 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6625 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6628 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6629 (ins VR128:$src1, i128mem:$src2),
6631 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6632 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6634 (set VR128:$dst, (IntId VR128:$src1,
6635 (memop addr:$src2), XMM0)),
6636 (set VR128:$dst, (IntId VR128:$src1,
6637 (memop addr:$src2))))]>, T8,
6638 Sched<[sched.Folded, sched.ReadAfterFold]>;
6641 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6642 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6643 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6644 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6646 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6647 (i8 timm:$src3)))]>, TA,
6648 Sched<[SchedWriteVecIMul.XMM]>;
6649 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6650 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6651 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6653 (int_x86_sha1rnds4 VR128:$src1,
6655 (i8 timm:$src3)))]>, TA,
6656 Sched<[SchedWriteVecIMul.XMM.Folded,
6657 SchedWriteVecIMul.XMM.ReadAfterFold]>;
6659 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6660 SchedWriteVecIMul.XMM>;
6661 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6662 SchedWriteVecIMul.XMM>;
6663 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6664 SchedWriteVecIMul.XMM>;
6667 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6668 SchedWriteVecIMul.XMM, 1>;
6670 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6671 SchedWriteVecIMul.XMM>;
6672 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6673 SchedWriteVecIMul.XMM>;
6676 // Aliases with explicit %xmm0
6677 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6678 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6679 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6680 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6682 //===----------------------------------------------------------------------===//
6683 // AES-NI Instructions
6684 //===----------------------------------------------------------------------===//
6686 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6687 Intrinsic IntId, PatFrag ld_frag,
6688 bit Is2Addr = 0, RegisterClass RC = VR128,
6689 X86MemOperand MemOp = i128mem> {
6690 let AsmString = OpcodeStr##
6691 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6692 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6693 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6694 (ins RC:$src1, RC:$src2), "",
6695 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6696 Sched<[WriteAESDecEnc]>;
6697 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6698 (ins RC:$src1, MemOp:$src2), "",
6699 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6700 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6704 // Perform One Round of an AES Encryption/Decryption Flow
6705 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6706 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
6707 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6708 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
6709 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6710 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
6711 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6712 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
6713 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6716 let Predicates = [NoVLX, HasVAES] in {
6717 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
6718 int_x86_aesni_aesenc_256, load, 0, VR256,
6719 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6720 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
6721 int_x86_aesni_aesenclast_256, load, 0, VR256,
6722 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6723 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
6724 int_x86_aesni_aesdec_256, load, 0, VR256,
6725 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6726 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
6727 int_x86_aesni_aesdeclast_256, load, 0, VR256,
6728 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6731 let Constraints = "$src1 = $dst" in {
6732 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
6733 int_x86_aesni_aesenc, memop, 1>;
6734 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
6735 int_x86_aesni_aesenclast, memop, 1>;
6736 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
6737 int_x86_aesni_aesdec, memop, 1>;
6738 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
6739 int_x86_aesni_aesdeclast, memop, 1>;
6742 // Perform the AES InvMixColumn Transformation
6743 let Predicates = [HasAVX, HasAES] in {
6744 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6746 "vaesimc\t{$src1, $dst|$dst, $src1}",
6748 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6750 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6751 (ins i128mem:$src1),
6752 "vaesimc\t{$src1, $dst|$dst, $src1}",
6753 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6754 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6756 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6758 "aesimc\t{$src1, $dst|$dst, $src1}",
6760 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6761 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6762 (ins i128mem:$src1),
6763 "aesimc\t{$src1, $dst|$dst, $src1}",
6764 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6765 Sched<[WriteAESIMC.Folded]>;
6767 // AES Round Key Generation Assist
6768 let Predicates = [HasAVX, HasAES] in {
6769 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6770 (ins VR128:$src1, u8imm:$src2),
6771 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6773 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6774 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6775 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6776 (ins i128mem:$src1, u8imm:$src2),
6777 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6779 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6780 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6782 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6783 (ins VR128:$src1, u8imm:$src2),
6784 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6786 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6787 Sched<[WriteAESKeyGen]>;
6788 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6789 (ins i128mem:$src1, u8imm:$src2),
6790 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6792 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6793 Sched<[WriteAESKeyGen.Folded]>;
6795 //===----------------------------------------------------------------------===//
6796 // PCLMUL Instructions
6797 //===----------------------------------------------------------------------===//
6799 // Immediate transform to help with commuting.
6800 def PCLMULCommuteImm : SDNodeXForm<timm, [{
6801 uint8_t Imm = N->getZExtValue();
6802 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6805 // SSE carry-less Multiplication instructions
6806 let Predicates = [NoAVX, HasPCLMUL] in {
6807 let Constraints = "$src1 = $dst" in {
6808 let isCommutable = 1 in
6809 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6810 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6811 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6813 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6814 Sched<[WriteCLMul]>;
6816 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6817 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6818 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6820 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6822 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6823 } // Constraints = "$src1 = $dst"
6825 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6827 (PCLMULQDQrm VR128:$src1, addr:$src2,
6828 (PCLMULCommuteImm timm:$src3))>;
6829 } // Predicates = [NoAVX, HasPCLMUL]
6832 foreach HI = ["hq","lq"] in
6833 foreach LO = ["hq","lq"] in {
6834 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6835 (PCLMULQDQrr VR128:$dst, VR128:$src,
6836 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6837 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6838 (PCLMULQDQrm VR128:$dst, i128mem:$src,
6839 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6842 // AVX carry-less Multiplication instructions
6843 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6844 PatFrag LdFrag, Intrinsic IntId> {
6845 let isCommutable = 1 in
6846 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6847 (ins RC:$src1, RC:$src2, u8imm:$src3),
6848 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6850 (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6851 Sched<[WriteCLMul]>;
6853 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6854 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6855 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6857 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6858 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6860 // We can commute a load in the first operand by swapping the sources and
6861 // rotating the immediate.
6862 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6863 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6864 (PCLMULCommuteImm timm:$src3))>;
6867 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6868 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6869 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6871 let Predicates = [NoVLX, HasVPCLMULQDQ] in
6872 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6873 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
// Assembler aliases vpclmul{h,l}q{h,l}qdq for the VEX forms; the immediate
// is built the same way as the SSE aliases above (bit 4 from Lo, bit 0 from Hi).
6875 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6876 X86MemOperand MemOp, string Hi, string Lo> {
6877 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6878 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6879 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6880 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6881 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6882 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
// Instantiate all four hq/lq selector combinations for one instruction pair.
6885 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6886 X86MemOperand MemOp> {
6887 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6888 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6889 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6890 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6894 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6895 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6897 //===----------------------------------------------------------------------===//
6898 // SSE4A Instructions
6899 //===----------------------------------------------------------------------===//
// SSE4A: EXTRQ/INSERTQ bit-field extract/insert on XMM registers, plus the
// MOVNTSS/MOVNTSD non-temporal scalar stores. All gated on HasSSE4A.
// NOTE(review): several pattern lines are elided from this extract (original
// line numbers jump), e.g. the second operand of the X86extrqi pattern.
6901 let Predicates = [HasSSE4A] in {
6903 let ExeDomain = SSEPackedInt in {
// Destructive forms: destination register doubles as the first source.
6904 let Constraints = "$src = $dst" in {
6905 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6906 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6907 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6908 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6910 PD, Sched<[SchedWriteVecALU.XMM]>;
6911 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6912 (ins VR128:$src, VR128:$mask),
6913 "extrq\t{$mask, $src|$src, $mask}",
6914 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6916 PD, Sched<[SchedWriteVecALU.XMM]>;
6918 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6919 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6920 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6921 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6922 timm:$len, timm:$idx))]>,
6923 XD, Sched<[SchedWriteVecALU.XMM]>;
6924 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6925 (ins VR128:$src, VR128:$mask),
6926 "insertq\t{$mask, $src|$src, $mask}",
6927 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6929 XD, Sched<[SchedWriteVecALU.XMM]>;
6931 } // ExeDomain = SSEPackedInt
6933 // Non-temporal (unaligned) scalar stores.
6934 let AddedComplexity = 400 in { // Prefer non-temporal versions
6935 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6936 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6937 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6939 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6940 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
// Select the scalar non-temporal stores for FR32/FR64 values by copying
// them into an XMM register first.
6943 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6944 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6946 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6947 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6949 } // AddedComplexity
6952 //===----------------------------------------------------------------------===//
6954 //===----------------------------------------------------------------------===//
6956 //===----------------------------------------------------------------------===//
6957 // VBROADCAST - Load from memory and broadcast to all elements of the
6958 // destination operand
// avx_broadcast_rm: memory-source broadcast (AVX1); the element is loaded
// via bcast_frag and replicated to every lane of the VT result.
6960 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6961 X86MemOperand x86memop, ValueType VT,
6962 PatFrag bcast_frag, SchedWrite Sched> :
6963 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6964 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6965 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6966 Sched<[Sched]>, VEX;
6968 // AVX2 adds register forms
// avx2_broadcast_rr: broadcast the low element of an XMM register.
6969 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6970 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6971 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6972 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6973 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6974 Sched<[Sched]>, VEX;
6976 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6977 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6978 f32mem, v4f32, X86VBroadcastld32,
6979 SchedWriteFShuffle.XMM.Folded>;
6980 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6981 f32mem, v8f32, X86VBroadcastld32,
6982 SchedWriteFShuffle.XMM.Folded>, VEX_L;
// vbroadcastsd only has a 256-bit form in AVX1.
6984 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6985 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6986 v4f64, X86VBroadcastld64,
6987 SchedWriteFShuffle.XMM.Folded>, VEX_L;
// AVX2 register-source broadcast forms of the same instructions.
6989 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6990 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6991 v4f32, v4f32, SchedWriteFShuffle.XMM>;
6992 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6993 v8f32, v4f32, WriteFShuffle256>, VEX_L;
6995 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6996 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6997 v4f64, v2f64, WriteFShuffle256>, VEX_L;
6999 //===----------------------------------------------------------------------===//
7000 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7001 // halves of a 256-bit vector.
// 128-bit-to-256-bit subvector broadcasts. The instructions themselves carry
// no patterns (mayLoad/hasSideEffects only); selection happens through the
// X86SubVBroadcast patterns below.
// NOTE(review): the (ins ...) operand lines of both defs are elided from this
// extract (original line numbers jump 7004->7006 and 7011->7013).
7003 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7004 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7006 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7007 Sched<[WriteShuffleLd]>, VEX, VEX_L;
7009 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7010 ExeDomain = SSEPackedSingle in
7011 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7013 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7014 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
// FP subvector-broadcast-from-memory selects VBROADCASTF128.
7016 let Predicates = [HasAVX, NoVLX] in {
7017 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7018 (VBROADCASTF128 addr:$src)>;
7019 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7020 (VBROADCASTF128 addr:$src)>;
7023 // NOTE: We're using FP instructions here, but execution domain fixing can
7024 // convert to integer when profitable.
7025 let Predicates = [HasAVX, NoVLX] in {
7026 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7027 (VBROADCASTF128 addr:$src)>;
7028 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7029 (VBROADCASTF128 addr:$src)>;
7030 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7031 (VBROADCASTF128 addr:$src)>;
7032 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7033 (VBROADCASTF128 addr:$src)>;
7036 //===----------------------------------------------------------------------===//
7037 // VINSERTF128 - Insert packed floating-point values
// VINSERTF128: insert a 128-bit value into one half of a 256-bit register,
// selected by the immediate. Instruction defs carry no patterns; lowering is
// done via the vinsert_lowering multiclass below.
7039 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7040 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7041 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7042 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7043 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7045 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7046 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7047 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7048 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7051 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
7052 // with YMM register containing zero.
7053 // FIXME: Avoid producing vxorps to clear the fake inputs.
7054 let Predicates = [HasAVX1Only] in {
7055 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
// Map vinsert128_insert nodes onto the rr/rm forms; the half index is
// recovered from the node by INSERT_get_vinsert128_imm.
7058 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7059 PatFrag memop_frag> {
7060 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7062 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7063 (INSERT_get_vinsert128_imm VR256:$ins))>;
7064 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7065 (From (memop_frag addr:$src2)),
7067 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7068 (INSERT_get_vinsert128_imm VR256:$ins))>;
7071 let Predicates = [HasAVX, NoVLX] in {
7072 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7073 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
// Integer inserts also use the FP instruction when AVX2 is unavailable.
7076 let Predicates = [HasAVX1Only] in {
7077 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7078 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7079 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7080 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7083 //===----------------------------------------------------------------------===//
7084 // VEXTRACTF128 - Extract packed floating-point values
// VEXTRACTF128: extract one 128-bit half of a 256-bit register (to register
// or directly to memory), selected by the immediate. Lowering goes through
// vextract_lowering, mirroring vinsert_lowering above.
7086 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7087 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7088 (ins VR256:$src1, u8imm:$src2),
7089 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7090 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7092 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7093 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7094 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7095 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
// Map vextract128_extract nodes (and extract-then-store) onto rr/mr forms.
7098 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7099 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7100 (To (!cast<Instruction>(InstrStr#rr)
7102 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7103 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7104 (iPTR imm))), addr:$dst),
7105 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7106 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7110 let Predicates = [HasAVX, NoVLX] in {
7111 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7112 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
// Integer extracts use the FP instruction when AVX2 is unavailable.
7115 let Predicates = [HasAVX1Only] in {
7116 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7117 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7118 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7119 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7122 //===----------------------------------------------------------------------===//
7123 // VMASKMOV - Conditional SIMD Packed Loads and Stores
// Masked load/store multiclass: emits the XMM (rm/mr) and YMM (Yrm/Ymr)
// forms wired to the per-width maskload/maskstore intrinsics. Note the
// intrinsic operand order: (address, mask) for loads, (address, mask, data)
// for stores.
7125 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7126 Intrinsic IntLd, Intrinsic IntLd256,
7127 Intrinsic IntSt, Intrinsic IntSt256,
7128 X86SchedWriteMaskMove schedX,
7129 X86SchedWriteMaskMove schedY> {
7130 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7131 (ins VR128:$src1, f128mem:$src2),
7132 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7133 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7134 VEX_4V, Sched<[schedX.RM]>;
7135 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7136 (ins VR256:$src1, f256mem:$src2),
7137 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7138 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7139 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7140 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7141 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7142 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7143 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7144 VEX_4V, Sched<[schedX.MR]>;
7145 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7146 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7147 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7148 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7149 VEX_4V, VEX_L, Sched<[schedY.MR]>;
7152 let ExeDomain = SSEPackedSingle in
7153 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7154 int_x86_avx_maskload_ps,
7155 int_x86_avx_maskload_ps_256,
7156 int_x86_avx_maskstore_ps,
7157 int_x86_avx_maskstore_ps_256,
7158 WriteFMaskMove32, WriteFMaskMove32Y>;
7159 let ExeDomain = SSEPackedDouble in
7160 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7161 int_x86_avx_maskload_pd,
7162 int_x86_avx_maskload_pd_256,
7163 int_x86_avx_maskstore_pd,
7164 int_x86_avx_maskstore_pd_256,
7165 WriteFMaskMove64, WriteFMaskMove64Y>;
7167 //===----------------------------------------------------------------------===//
7168 // VPERMIL - Permute Single and Double Floating-Point Values
// VPERMILPS/PD: variable (register- or memory-sourced control vector, rr/rm,
// scheduled on varsched) and immediate-controlled (ri/mi, scheduled on sched)
// in-lane permutes.
// NOTE(review): some Sched<...> continuation lines of the rr/ri defs are
// elided from this extract (original line numbers jump 7181->7183, 7193->7195).
7171 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7172 RegisterClass RC, X86MemOperand x86memop_f,
7173 X86MemOperand x86memop_i,
7174 ValueType f_vt, ValueType i_vt,
7175 X86FoldableSchedWrite sched,
7176 X86FoldableSchedWrite varsched> {
7177 let Predicates = [HasAVX, NoVLX] in {
7178 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7179 (ins RC:$src1, RC:$src2),
7180 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7181 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7183 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7184 (ins RC:$src1, x86memop_i:$src2),
7185 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7186 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7187 (i_vt (load addr:$src2)))))]>, VEX_4V,
7188 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7190 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7191 (ins RC:$src1, u8imm:$src2),
7192 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7193 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7195 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7196 (ins x86memop_f:$src1, u8imm:$src2),
7197 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7199 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7200 Sched<[sched.Folded]>;
7201 }// Predicates = [HasAVX, NoVLX]
7204 let ExeDomain = SSEPackedSingle in {
7205 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7206 v4f32, v4i32, SchedWriteFShuffle.XMM,
7207 SchedWriteFVarShuffle.XMM>;
7208 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7209 v8f32, v8i32, SchedWriteFShuffle.YMM,
7210 SchedWriteFVarShuffle.YMM>, VEX_L;
7212 let ExeDomain = SSEPackedDouble in {
7213 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7214 v2f64, v2i64, SchedWriteFShuffle.XMM,
7215 SchedWriteFVarShuffle.XMM>;
7216 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7217 v4f64, v4i64, SchedWriteFShuffle.YMM,
7218 SchedWriteFVarShuffle.YMM>, VEX_L;
7221 //===----------------------------------------------------------------------===//
7222 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
// VPERM2F128: select/permute 128-bit halves of two 256-bit sources by
// immediate. Only the v4f64 form carries patterns; integer and
// commuted-load cases are handled by the Pat<> records below.
7225 let ExeDomain = SSEPackedSingle in {
7226 let isCommutable = 1 in
7227 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7228 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7229 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7230 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7231 (i8 timm:$src3))))]>, VEX_4V, VEX_L,
7232 Sched<[WriteFShuffle256]>;
7233 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7234 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7235 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7236 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7237 (i8 timm:$src3)))]>, VEX_4V, VEX_L,
7238 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7241 // Immediate transform to help with commuting.
// XOR 0x22 toggles the source-select bit in each nibble, i.e. swaps which
// input each selected half comes from.
7242 def Perm2XCommuteImm : SDNodeXForm<timm, [{
7243 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7246 let Predicates = [HasAVX] in {
7247 // Pattern with load in other operand.
7248 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7249 VR256:$src1, (i8 timm:$imm))),
7250 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
// Without AVX2, integer 128-bit permutes also use the FP instruction.
7253 let Predicates = [HasAVX1Only] in {
7254 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7255 (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
7256 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7257 (loadv4i64 addr:$src2), (i8 timm:$imm))),
7258 (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
7259 // Pattern with load in other operand.
7260 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7261 VR256:$src1, (i8 timm:$imm))),
7262 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7265 //===----------------------------------------------------------------------===//
7266 // VZERO - Zero YMM registers
7267 // Note: These instruction do not affect the YMM16-YMM31.
// vzeroall / vzeroupper. Both list YMM0-YMM15 as Defs; the section header
// above notes they do not affect YMM16-YMM31.
7270 let SchedRW = [WriteSystem] in {
7271 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7272 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7273 // Zero All YMM registers
7274 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7275 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7276 Requires<[HasAVX]>, VEX_WIG;
7278 // Zero Upper bits of YMM registers
// Same opcode as vzeroall; distinguished by the absence of VEX_L.
7279 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7280 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7281 Requires<[HasAVX]>, VEX_WIG;
7285 //===----------------------------------------------------------------------===//
7286 // Half precision conversion instructions
// F16C half <-> single conversions: vcvtph2ps (widen) and vcvtps2ph
// (narrow, with a rounding-control immediate), plus load/store folding
// patterns and scalar fp16 conversion lowerings.
7289 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7290 X86FoldableSchedWrite sched> {
7291 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7292 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7293 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7294 T8PD, VEX, Sched<[sched]>;
7295 let hasSideEffects = 0, mayLoad = 1 in
7296 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7297 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7298 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
7299 T8PD, VEX, Sched<[sched.Folded]>;
7302 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7303 SchedWrite RR, SchedWrite MR> {
7304 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7305 (ins RC:$src1, i32u8imm:$src2),
7306 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7307 [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
7308 TAPD, VEX, Sched<[RR]>;
7309 let hasSideEffects = 0, mayStore = 1 in
7310 def mr : Ii8<0x1D, MRMDestMem, (outs),
7311 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7312 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7313 TAPD, VEX, Sched<[MR]>;
7316 let Predicates = [HasF16C, NoVLX] in {
7317 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7318 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7319 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7320 WriteCvtPS2PHSt>, SIMD_EXC;
7321 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7322 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7324 // Pattern match vcvtph2ps of a scalar i64 load.
7325 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7326 (VCVTPH2PSrm addr:$src)>;
7327 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
7328 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7329 (VCVTPH2PSrm addr:$src)>;
// Fold an extract-low-64-bits-and-store of the cvtps2ph result into the
// memory form of the instruction.
7331 def : Pat<(store (f64 (extractelt
7332 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
7333 (iPTR 0))), addr:$dst),
7334 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7335 def : Pat<(store (i64 (extractelt
7336 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
7337 (iPTR 0))), addr:$dst),
7338 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7339 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7340 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7343 // Patterns for matching conversions from float to half-float and vice versa.
7344 let Predicates = [HasF16C, NoVLX] in {
7345 // Use MXCSR.RC for rounding instead of explicitly specifying the default
7346 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
7347 // configurations we support (the default). However, falling back to MXCSR is
7348 // more consistent with other instructions, which are always controlled by it.
7349 // It's encoded as 0b100.
7350 def : Pat<(fp_to_f16 FR32:$src),
7351 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
7352 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
7354 def : Pat<(f16_to_fp GR16:$src),
7355 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7356 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
7358 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
7359 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7360 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
7363 //===----------------------------------------------------------------------===//
7364 // AVX2 Instructions
7365 //===----------------------------------------------------------------------===//
7367 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
// AVX2 blend-with-immediate multiclass (rri/rmi forms) plus a commutation
// pattern for the load-in-first-operand case, and VPBLENDD instantiations.
// The v2i64/v4i64 X86Blendi patterns below reuse vpblendd by rescaling the
// immediate via the BlendScale*/BlendScaleCommute* transforms.
7368 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7369 ValueType OpVT, X86FoldableSchedWrite sched,
7371 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7372 let isCommutable = 1 in
7373 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7374 (ins RC:$src1, RC:$src2, u8imm:$src3),
7375 !strconcat(OpcodeStr,
7376 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7377 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7378 Sched<[sched]>, VEX_4V;
7379 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7380 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7381 !strconcat(OpcodeStr,
7382 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7384 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7385 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7387 // Pattern to commute if load is in first source.
7388 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7389 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7390 (commuteXForm timm:$src3))>;
7393 let Predicates = [HasAVX2] in {
7394 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7395 SchedWriteBlend.XMM, VR128, i128mem,
7397 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7398 SchedWriteBlend.YMM, VR256, i256mem,
7399 BlendCommuteImm8>, VEX_L;
7401 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7402 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7403 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7404 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7405 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7406 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7408 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7409 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7410 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7411 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7412 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7413 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7416 // For insertion into the zero index (low half) of a 256-bit vector, it is
7417 // more efficient to generate a blend with immediate instead of an insert*128.
7418 // NOTE: We're using FP instructions here, but execution domain fixing should
7419 // take care of using integer instructions when profitable.
7420 let Predicates = [HasAVX] in {
// Register-source cases: widen the 128-bit value into the low half of an
// undef YMM, then blend the high half (mask 0xf selects $src2's low lanes).
7421 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7422 (VBLENDPSYrri VR256:$src1,
7423 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7424 VR128:$src2, sub_xmm), 0xf)>;
7425 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7426 (VBLENDPSYrri VR256:$src1,
7427 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7428 VR128:$src2, sub_xmm), 0xf)>;
7429 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7430 (VBLENDPSYrri VR256:$src1,
7431 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7432 VR128:$src2, sub_xmm), 0xf)>;
7433 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7434 (VBLENDPSYrri VR256:$src1,
7435 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7436 VR128:$src2, sub_xmm), 0xf)>;
// Memory-source cases: the 256-bit load becomes the folded operand and the
// mask flips to 0xf0 (keep the inserted low half from the register input).
7438 def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7439 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7440 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7441 def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7442 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7443 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7444 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7445 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7446 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7447 def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7448 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7449 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7452 //===----------------------------------------------------------------------===//
7453 // VPBROADCAST - Load from memory and broadcast to all elements of the
7454 // destination operand
// AVX2 integer broadcast multiclass: rr/rm (XMM) and Yrr/Yrm (YMM) forms,
// plus a pattern that broadcasts from a YMM source by first extracting its
// low XMM half. Followed by the VPBROADCASTB/W/D/Q instantiations and
// load-folding patterns for tricky load shapes (vzload, truncated/extending
// i16 loads).
7456 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7457 X86MemOperand x86memop, PatFrag bcast_frag,
7458 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7459 let Predicates = [HasAVX2, prd] in {
7460 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7461 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7463 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7464 Sched<[SchedWriteShuffle.XMM]>, VEX;
7465 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7466 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7468 (OpVT128 (bcast_frag addr:$src)))]>,
7469 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7470 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7471 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7473 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7474 Sched<[WriteShuffle256]>, VEX, VEX_L;
7475 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7476 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7478 (OpVT256 (bcast_frag addr:$src)))]>,
7479 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7481 // Provide aliases for broadcast from the same register class that
7482 // automatically does the extract.
7483 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7484 (!cast<Instruction>(NAME#"Yrr")
7485 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7489 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7490 v16i8, v32i8, NoVLX_Or_NoBWI>;
7491 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7492 v8i16, v16i16, NoVLX_Or_NoBWI>;
7493 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7494 v4i32, v8i32, NoVLX>;
7495 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7496 v2i64, v4i64, NoVLX>;
7498 let Predicates = [HasAVX2, NoVLX] in {
7499 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
7500 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7501 (VPBROADCASTQrm addr:$src)>;
7502 def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
7503 (VPBROADCASTQYrm addr:$src)>;
7505 // FIXME this is to handle aligned extloads from i8/i16.
7506 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
7507 (VPBROADCASTDrm addr:$src)>;
7508 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
7509 (VPBROADCASTDYrm addr:$src)>;
7511 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7512 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7513 // This means we'll encounter truncated i32 loads; match that here.
7514 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7515 (VPBROADCASTWrm addr:$src)>;
7516 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7517 (VPBROADCASTWYrm addr:$src)>;
7518 def : Pat<(v8i16 (X86VBroadcast
7519 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7520 (VPBROADCASTWrm addr:$src)>;
7521 def : Pat<(v8i16 (X86VBroadcast
7522 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7523 (VPBROADCASTWrm addr:$src)>;
7524 def : Pat<(v16i16 (X86VBroadcast
7525 (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7526 (VPBROADCASTWYrm addr:$src)>;
7527 def : Pat<(v16i16 (X86VBroadcast
7528 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7529 (VPBROADCASTWYrm addr:$src)>;
7531 // FIXME this is to handle aligned extloads from i8.
7532 def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
7533 (VPBROADCASTWrm addr:$src)>;
7534 def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
7535 (VPBROADCASTWYrm addr:$src)>;
// Fallback broadcasts from register sources (FP scalar regs, and GPRs moved
// into vector registers first), for when the load-based patterns above don't
// fire.
7538 let Predicates = [HasAVX2, NoVLX] in {
7539 // Provide fallback in case the load node that is used in the patterns above
7540 // is used by additional users, which prevents the pattern selection.
7541 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7542 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7543 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7544 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7545 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7546 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
// GR8/GR16 sources: widen to i32 via INSERT_SUBREG, move into an XMM with
// VMOVDI2PDIrr, then broadcast.
7549 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7550 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7551 (VPBROADCASTBrr (VMOVDI2PDIrr
7552 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7553 GR8:$src, sub_8bit))))>;
7554 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7555 (VPBROADCASTBYrr (VMOVDI2PDIrr
7556 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7557 GR8:$src, sub_8bit))))>;
7559 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7560 (VPBROADCASTWrr (VMOVDI2PDIrr
7561 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7562 GR16:$src, sub_16bit))))>;
7563 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7564 (VPBROADCASTWYrr (VMOVDI2PDIrr
7565 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7566 GR16:$src, sub_16bit))))>;
// GR32/GR64 sources move directly into an XMM and broadcast.
7568 let Predicates = [HasAVX2, NoVLX] in {
7569 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7570 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7571 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7572 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7573 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7574 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7575 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7576 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7579 // AVX1 broadcast patterns
// Without AVX2, integer broadcast-loads are implemented with the FP
// vbroadcastss/sd instructions.
7580 let Predicates = [HasAVX1Only] in {
7581 def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7582 (VBROADCASTSSYrm addr:$src)>;
7583 def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7584 (VBROADCASTSDYrm addr:$src)>;
7585 def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7586 (VBROADCASTSSrm addr:$src)>;
7589 // Provide fallback in case the load node that is used in the patterns above
7590 // is used by additional users, which prevents the pattern selection.
7591 let Predicates = [HasAVX, NoVLX] in {
7592 // 128bit broadcasts:
// v2f64 splats lower to movddup (register or load form).
7593 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7594 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7595 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7596 (VMOVDDUPrm addr:$src)>;
7598 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7599 (VMOVDDUPrr VR128:$src)>;
7600 def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
7601 (VMOVDDUPrm addr:$src)>;
7602 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
7603 (VMOVDDUPrm addr:$src)>;
// AVX1-only register broadcasts: splat within a lane (vpermilps/vpshufd/
// movddup), then for 256-bit results duplicate into both halves via
// vinsertf128.
7606 let Predicates = [HasAVX1Only] in {
7607 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7608 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7609 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7610 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7611 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7612 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7613 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7614 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7615 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7616 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7618 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7619 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7620 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7621 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7622 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7623 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
// 0x44 shuffles the low 64 bits into both qword positions.
7624 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7625 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7626 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7627 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7629 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7630 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7631 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7632 (VMOVDDUPrm addr:$src)>;
7635 //===----------------------------------------------------------------------===//
7636 // VPERM - Permute instructions
// avx2_perm - full-width variable permutes (VPERMD/VPERMPS): $src2 supplies
// per-element indices selecting elements of $src1 (X86VPermv).  Only YMM
// forms exist in AVX2; register and load-folded variants are defined.
// NOTE(review): some body lines (e.g. the `[(set VR256:$dst,` before
// original 7648) and closing braces were dropped by extraction.
7639 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7640 ValueType OpVT, X86FoldableSchedWrite Sched,
7641 X86MemOperand memOp> {
7642 let Predicates = [HasAVX2, NoVLX] in {
7643 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7644 (ins VR256:$src1, VR256:$src2),
7645 !strconcat(OpcodeStr,
7646 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7648 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7649 Sched<[Sched]>, VEX_4V, VEX_L;
7650 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7651 (ins VR256:$src1, memOp:$src2),
7652 !strconcat(OpcodeStr,
7653 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7655 (OpVT (X86VPermv VR256:$src1,
7656 (load addr:$src2))))]>,
7657 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7661 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7662 let ExeDomain = SSEPackedSingle in
7663 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
// avx2_perm_imm - immediate-controlled 64-bit-element permutes
// (VPERMQ/VPERMPD): an 8-bit immediate selects the qword arrangement
// (X86VPermi).  Register and memory-source variants; VEX_W is applied at
// the defm sites below.
7665 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7666 ValueType OpVT, X86FoldableSchedWrite Sched,
7667 X86MemOperand memOp> {
7668 let Predicates = [HasAVX2, NoVLX] in {
7669 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7670 (ins VR256:$src1, u8imm:$src2),
7671 !strconcat(OpcodeStr,
7672 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7674 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7675 Sched<[Sched]>, VEX, VEX_L;
7676 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7677 (ins memOp:$src1, u8imm:$src2),
7678 !strconcat(OpcodeStr,
7679 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7681 (OpVT (X86VPermi (mem_frag addr:$src1),
7682 (i8 timm:$src2))))]>,
7683 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7687 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7688 WriteShuffle256, i256mem>, VEX_W;
7689 let ExeDomain = SSEPackedDouble in
7690 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7691 WriteFShuffle256, f256mem>, VEX_W;
7693 //===----------------------------------------------------------------------===//
7694 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
// Selects two 128-bit lanes from the concatenation of $src1:$src2 per the
// immediate.  Marked commutable: swapping sources is compensated by
// rewriting the immediate (see Perm2XCommuteImm pattern below), which lets
// a load in the *first* operand still be folded into the rm form.
7696 let isCommutable = 1 in
7697 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7698 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7699 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7700 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7701 (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
7703 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7704 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7705 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7706 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7707 (i8 timm:$src3)))]>,
7708 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7710 let Predicates = [HasAVX2] in
7711 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7712 VR256:$src1, (i8 timm:$imm))),
7713 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7716 //===----------------------------------------------------------------------===//
7717 // VINSERTI128 - Insert packed integer values
// Inserts a 128-bit value into the selected half of a YMM register.  The
// instructions carry no patterns ([]) — selection is done through the
// shared vinsert_lowering multiclass instantiations below, one per
// 128/256-bit integer element type.
7719 let hasSideEffects = 0 in {
7720 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7721 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7722 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7723 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7725 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7726 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7727 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7728 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7731 let Predicates = [HasAVX2, NoVLX] in {
7732 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
7733 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
7734 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
7735 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
7738 //===----------------------------------------------------------------------===//
7739 // VEXTRACTI128 - Extract packed integer values
// Extracts the selected 128-bit half of a YMM register to an XMM register
// or to memory.  Like VINSERTI128, the defs have no patterns; selection
// goes through the vextract_lowering instantiations below.
7741 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7742 (ins VR256:$src1, u8imm:$src2),
7743 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7744 Sched<[WriteShuffle256]>, VEX, VEX_L;
7745 let hasSideEffects = 0, mayStore = 1 in
7746 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7747 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7748 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7749 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7751 let Predicates = [HasAVX2, NoVLX] in {
7752 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
7753 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
7754 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7755 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7758 //===----------------------------------------------------------------------===//
7759 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
// avx2_pmovmask - defines the four VPMASKMOV forms (XMM/YMM x load/store)
// directly from the AVX2 maskload/maskstore intrinsics.  $src1 is the
// element mask for loads; for stores the mask is $src1 and the data $src2.
7761 multiclass avx2_pmovmask<string OpcodeStr,
7762 Intrinsic IntLd128, Intrinsic IntLd256,
7763 Intrinsic IntSt128, Intrinsic IntSt256> {
7764 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7765 (ins VR128:$src1, i128mem:$src2),
7766 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7767 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7768 VEX_4V, Sched<[WriteVecMaskedLoad]>;
7769 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7770 (ins VR256:$src1, i256mem:$src2),
7771 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7772 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7773 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
7774 def mr : AVX28I<0x8e, MRMDestMem, (outs),
7775 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7776 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7777 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7778 VEX_4V, Sched<[WriteVecMaskedStore]>;
7779 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7780 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7781 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7782 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7783 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
7786 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7787 int_x86_avx2_maskload_d,
7788 int_x86_avx2_maskload_d_256,
7789 int_x86_avx2_maskstore_d,
7790 int_x86_avx2_maskstore_d_256>;
7791 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7792 int_x86_avx2_maskload_q,
7793 int_x86_avx2_maskload_q_256,
7794 int_x86_avx2_maskstore_q,
7795 int_x86_avx2_maskstore_q_256>, VEX_W;
// maskmov_lowering - maps the target-independent masked_load/masked_store
// nodes onto VMASKMOV/VPMASKMOV instructions.  masked_load is matched only
// with an undef or all-zeros passthrough: the hardware zeroes masked-off
// elements, so the zero passthrough folds away for free.
// NOTE(review): the MaskVT template parameter line (~7798) was dropped by
// extraction.
7797 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7800 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7801 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7803 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7804 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7805 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7806 (VT immAllZerosV))),
7807 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7809 let Predicates = [HasAVX] in {
7810 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7811 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7812 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7813 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7815 let Predicates = [HasAVX1Only] in {
7816 // load/store i32/i64 not supported use ps/pd version
7817 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7818 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7819 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7820 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
7822 let Predicates = [HasAVX2] in {
7823 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7824 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7825 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7826 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7829 //===----------------------------------------------------------------------===//
7830 // SubVector Broadcasts
7831 // Provide fallback in case the load node that is used in the patterns above
7832 // is used by additional users, which prevents the pattern selection.
// A 128->256-bit subvector broadcast from a register is expanded to a
// VINSERTF128 duplicating the XMM source into both halves of the YMM
// result (INSERT_SUBREG places the low half, imm 1 inserts the high half).
7834 let Predicates = [HasAVX, NoVLX] in {
7835 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
7836 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7837 (v2f64 VR128:$src), 1)>;
7838 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
7839 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7840 (v4f32 VR128:$src), 1)>;
7843 // NOTE: We're using FP instructions here, but execution domain fixing can
7844 // convert to integer when profitable.
7845 let Predicates = [HasAVX, NoVLX] in {
7846 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
7847 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7848 (v2i64 VR128:$src), 1)>;
7849 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
7850 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7851 (v4i32 VR128:$src), 1)>;
7852 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
7853 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7854 (v8i16 VR128:$src), 1)>;
7855 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
7856 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7857 (v16i8 VR128:$src), 1)>;
7860 //===----------------------------------------------------------------------===//
7861 // Variable Bit Shifts
// avx2_var_shift - per-element variable shifts (VPSLLV/VPSRLV/VPSRAV):
// each element of $src1 is shifted by the corresponding element of $src2.
// Four variants per opcode: XMM/YMM x reg/mem.  Note the defms below
// include no 64-bit arithmetic shift — AVX2 has no vpsravq.
7863 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7864 ValueType vt128, ValueType vt256> {
7865 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7866 (ins VR128:$src1, VR128:$src2),
7867 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7869 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7870 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
7871 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7872 (ins VR128:$src1, i128mem:$src2),
7873 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7875 (vt128 (OpNode VR128:$src1,
7876 (vt128 (load addr:$src2)))))]>,
7877 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7878 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
7879 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7880 (ins VR256:$src1, VR256:$src2),
7881 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7883 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7884 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
7885 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7886 (ins VR256:$src1, i256mem:$src2),
7887 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7889 (vt256 (OpNode VR256:$src1,
7890 (vt256 (load addr:$src2)))))]>,
7891 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7892 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
7895 let Predicates = [HasAVX2, NoVLX] in {
7896 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
7897 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
7898 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
7899 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
7900 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
7903 //===----------------------------------------------------------------------===//
7904 // VGATHER - GATHER Operations
// avx2_gather - gathers produce TWO results: the gathered data ($dst) and
// the written-back mask ($mask_wb, cleared element-by-element as loads
// complete).  The constraints block below ties $src1/$mask to the outputs
// and marks both outputs @earlyclobber.  MTx/MTy give the mask value types
// when they differ from the data types (FP gathers use integer masks).
7906 // FIXME: Improve scheduling of gather instructions.
7907 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
7908 ValueType VTy, PatFrag GatherNode128,
7909 PatFrag GatherNode256, RegisterClass RC256,
7910 X86MemOperand memop128, X86MemOperand memop256,
7911 ValueType MTx = VTx, ValueType MTy = VTy> {
7912 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
7913 (ins VR128:$src1, memop128:$src2, VR128:$mask),
7914 !strconcat(OpcodeStr,
7915 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7916 [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
7917 (GatherNode128 VR128:$src1, VR128:$mask,
7918 vectoraddr:$src2))]>,
7919 VEX, Sched<[WriteLoad]>;
7920 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
7921 (ins RC256:$src1, memop256:$src2, RC256:$mask),
7922 !strconcat(OpcodeStr,
7923 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7924 [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
7925 (GatherNode256 RC256:$src1, RC256:$mask,
7926 vectoraddr:$src2))]>,
7927 VEX, VEX_L, Sched<[WriteLoad]>;
7930 let Predicates = [HasAVX2] in {
7931 let mayLoad = 1, hasSideEffects = 0, Constraints
7932 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
7934 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
7935 mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
7936 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
7937 mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
7938 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
7939 mgatherv8i32, VR256, vx128mem, vy256mem>;
7940 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
7941 mgatherv4i64, VR128, vx64mem, vy128mem>;
7943 let ExeDomain = SSEPackedDouble in {
7944 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
7945 mgatherv4i32, VR256, vx128mem, vx256mem,
7946 v2i64, v4i64>, VEX_W;
7947 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
7948 mgatherv4i64, VR256, vx128mem, vy256mem,
7949 v2i64, v4i64>, VEX_W;
// NOTE(review): trailing mask-type arguments of the two defms below
// (original lines 7955 and 7958-7960) were dropped by extraction.
7952 let ExeDomain = SSEPackedSingle in {
7953 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
7954 mgatherv8i32, VR256, vx128mem, vy256mem,
7956 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
7957 mgatherv4i64, VR128, vx64mem, vy128mem,
7963 //===----------------------------------------------------------------------===//
7964 // GFNI instructions
7965 //===----------------------------------------------------------------------===//
// GF2P8MULB_rm - GF(2^8) polynomial multiply (gf2p8mulb).  Is2Addr selects
// the 2-operand SSE asm string vs. the 3-operand VEX one; the instruction
// defs pass "" and take the string from the AsmString let.
7967 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
7968 RegisterClass RC, PatFrag MemOpFrag,
7969 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7970 let ExeDomain = SSEPackedInt,
7971 AsmString = !if(Is2Addr,
7972 OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
7973 OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
7974 let isCommutable = 1 in
7975 def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
7976 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
7977 Sched<[SchedWriteVecALU.XMM]>, T8PD;
7979 def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
7980 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
7981 (MemOpFrag addr:$src2))))]>,
7982 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
// GF2P8AFFINE_rmi - affine transforms (gf2p8affineqb / gf2p8affineinvqb)
// with an 8-bit immediate; same Is2Addr asm-string scheme as above.
7986 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
7987 SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
7988 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7989 let AsmString = !if(Is2Addr,
7990 OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7991 OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
7992 def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
7993 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
7994 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
7995 SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
7996 def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
7997 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
7998 [(set RC:$dst, (OpVT (OpNode RC:$src1,
7999 (MemOpFrag addr:$src2),
8000 timm:$src3)))], SSEPackedInt>,
8001 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
// GF2P8AFFINE_common - instantiates SSE (2-addr), VEX-128 and VEX-256
// variants of an affine opcode under the appropriate GFNI predicates.
8005 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
8006 let Constraints = "$src1 = $dst",
8007 Predicates = [HasGFNI, UseSSE2] in
8008 defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
8009 VR128, load, i128mem, 1>;
8010 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
8011 defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
8012 load, i128mem>, VEX_4V, VEX_W;
8013 defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
8014 load, i256mem>, VEX_4V, VEX_L, VEX_W;
// GF2P8MULB instantiations: SSE 2-addr form plus VEX 128/256-bit forms.
8019 let Constraints = "$src1 = $dst",
8020 Predicates = [HasGFNI, UseSSE2] in
8021 defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
8023 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
8024 defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
8026 defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
8027 i256mem>, VEX_4V, VEX_L;
8029 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
8030 let isCommutable = 0 in {
8031 defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
8032 X86GF2P8affineinvqb>, TAPD;
8033 defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
8034 X86GF2P8affineqb>, TAPD;