1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file describes the X86 SSE instruction set, defining the instructions,
10 // and properties of the instructions which are needed for code generation,
11 // machine code emission, and analysis.
13 //===----------------------------------------------------------------------===//
15 //===----------------------------------------------------------------------===//
16 // SSE 1 & 2 Instructions Classes
17 //===----------------------------------------------------------------------===//
19 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
// NOTE(review): this dump embeds the original file's line numbers and has
// gaps (e.g. 23, 27, 31-32 missing), so part of the parameter list, the
// !if(Is2Addr, ...) asm selector and the closing braces are not visible here.
20 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
21 RegisterClass RC, X86MemOperand x86memop,
22 Domain d, X86FoldableSchedWrite sched,
24 let isCodeGenOnly = 1 in {
// Register-register form; commutable so isel may swap the two sources.
25 let isCommutable = 1 in {
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
// Register-memory form: $src2 is loaded and folded into the operation.
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
42 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
// Intrinsic (VR128-typed) variant of sse12_fp_scalar: operates on whole
// XMM registers so upper elements are preserved per the intrinsic contract.
43 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
44 SDPatternOperator OpNode, RegisterClass RC,
45 ValueType VT, string asm, Operand memopr,
46 PatFrags mem_frags, Domain d,
47 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
// hasSideEffects = 0: no implicit state beyond the modeled operands.
48 let hasSideEffects = 0 in {
49 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
51 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
// Memory form folds $src2 through the mem_frags pattern fragment.
56 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
58 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60 [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
61 Sched<[sched.Folded, sched.ReadAfterFold]>;
65 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
// Packed (vector) counterpart of sse12_fp_scalar; vt selects the packed
// element type pattern for the rr form.
66 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
67 RegisterClass RC, ValueType vt,
68 X86MemOperand x86memop, PatFrag mem_frag,
69 Domain d, X86FoldableSchedWrite sched,
71 let isCommutable = 1 in
72 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
74 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
// Memory form; load fragment is parameterized (aligned vs unaligned).
79 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
81 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
85 Sched<[sched.Folded, sched.ReadAfterFold]>;
88 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
// Logical-op variant: instantiators supply the selection patterns
// explicitly via pat_rr / pat_rm instead of an SDNode.
89 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90 string OpcodeStr, X86MemOperand x86memop,
91 X86FoldableSchedWrite sched,
92 list<dag> pat_rr, list<dag> pat_rm,
94 let isCommutable = 1, hasSideEffects = 0 in
95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
// mayLoad = 1 is required because the rm form has no pattern visible
// here from which the load could be inferred.
101 let hasSideEffects = 0, mayLoad = 1 in
102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
107 Sched<[sched.Folded, sched.ReadAfterFold]>;
111 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112 // This is expanded by ExpandPostRAPseudos.
// Pseudos producing scalar/f128 zero; rematerializable and as cheap as a
// move so the register allocator may freely re-emit them.
113 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114 isPseudo = 1, SchedRW = [WriteZero] in {
115 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
116 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
117 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
118 [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
119 def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
120 [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123 //===----------------------------------------------------------------------===//
124 // AVX & SSE - Zero/One Vectors
125 //===----------------------------------------------------------------------===//
127 // Alias instruction that maps zero vector to pxor / xorp* for sse.
128 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
129 // swizzled by ExecutionDomainFix to pxor.
130 // We set canFoldAsLoad because this can be converted to a constant-pool
131 // load of an all-zeros value if folding it would be beneficial.
132 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
133 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
134 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
135 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
// All other 128-bit all-zeros types are funneled into the one V_SET0
// pseudo so only a single zero idiom reaches later passes.
138 let Predicates = [NoAVX512] in {
139 def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
140 def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
141 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
142 def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
143 def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
148 // and doesn't need it because on sandy bridge the register is set to zero
149 // at the rename stage without using any execution unit, so SET0PSY
150 // and SET0PDY can be used for vector int instructions without penalty
151 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
152 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
153 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
154 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
// Funnel every 256-bit all-zeros type into the single AVX_SET0 pseudo.
157 let Predicates = [NoAVX512] in {
158 def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
159 def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
160 def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
161 def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
162 def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
165 // We set canFoldAsLoad because this can be converted to a constant-pool
166 // load of an all-ones value if folding it would be beneficial.
167 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
168 isPseudo = 1, SchedRW = [WriteZero] in {
169 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
170 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
// AVX1-only 256-bit all-ones pseudo is gated on OptForMinSize; the AVX2
// form below has no size restriction.
171 let Predicates = [HasAVX1Only, OptForMinSize] in {
172 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
173 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
175 let Predicates = [HasAVX2] in
176 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
177 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
180 //===----------------------------------------------------------------------===//
181 // SSE 1 & 2 - Move FP Scalar Instructions
183 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
184 // register copies because it's a partial register update; Register-to-register
185 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
186 // that the insert be implementable in terms of a copy, and just mentioned, we
187 // don't use movss/movsd for copies.
188 //===----------------------------------------------------------------------===//
// Register-register movss/movsd forms shared by the SSE and AVX
// instantiations in sse12_move below.
190 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
191 X86MemOperand x86memop, string base_opc,
192 string asm_opr, Domain d, string Name> {
193 let isCommutable = 1 in
194 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
195 (ins VR128:$src1, VR128:$src2),
196 !strconcat(base_opc, asm_opr),
197 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
198 Sched<[SchedWriteFShuffle.XMM]>;
200 // For the disassembler
// rr_REV is the store-direction encoding (opcode 0x11); pattern-less,
// kept only so the disassembler can round-trip both encodings.
201 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
202 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
203 (ins VR128:$src1, VR128:$src2),
204 !strconcat(base_opc, asm_opr), []>,
205 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
// Instantiates both the VEX (V-prefixed) and legacy SSE variants of a
// scalar move, plus their store forms and ".s" disassembly aliases.
208 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
209 X86MemOperand x86memop, string OpcodeStr,
210 Domain d, string Name, Predicate pred> {
// AVX flavor: three-operand asm string, no tied constraint.
212 let Predicates = [UseAVX, OptForSize] in
213 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
214 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
216 VEX_4V, VEX_LIG, VEX_WIG;
218 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
219 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
220 [(store RC:$src, addr:$dst)], d>,
221 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// Legacy SSE flavor: destructive two-operand form, $src1 tied to $dst.
223 let Constraints = "$src1 = $dst" in {
224 let Predicates = [pred, NoSSE41_Or_OptForSize] in
225 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
226 "\t{$src2, $dst|$dst, $src2}", d, Name>;
229 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
231 [(store RC:$src, addr:$dst)], d>,
232 Sched<[WriteFStore]>;
// ".s" suffixed aliases select the reversed (store-direction) encodings.
234 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
235 (!cast<Instruction>("V"#NAME#"rr_REV")
236 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
237 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
238 (!cast<Instruction>(NAME#"rr_REV")
239 VR128:$dst, VR128:$src2), 0>;
242 // Loading from memory automatically zeroing upper bits.
// rm forms produce a VR128 with upper elements zeroed (vzloadfrag);
// the _alt forms below load straight into the scalar register class.
243 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
244 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
246 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
248 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
249 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
250 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
251 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
252 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
255 // _alt version uses FR32/FR64 register class.
256 let isCodeGenOnly = 1 in {
257 def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
258 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
259 [(set RC:$dst, (mem_pat addr:$src))], d>,
260 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
261 def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
262 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
263 [(set RC:$dst, (mem_pat addr:$src))], d>,
// Concrete instantiations: MOVSS (single, XS prefix) and MOVSD (double,
// XD prefix) for both the move and load-from-memory multiclasses.
268 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
269 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
270 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
271 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
273 let canFoldAsLoad = 1, isReMaterializable = 1 in {
274 defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
275 SSEPackedSingle>, XS;
276 defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
277 SSEPackedDouble>, XD;
// Selection patterns mapping scalar loads and zero-extending moves onto
// the MOVSS/MOVSD instructions defined above.
281 let Predicates = [UseAVX] in {
282 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
283 (VMOVSSrm addr:$src)>;
284 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
285 (VMOVSDrm addr:$src)>;
287 // Represent the same patterns above but in the form they appear for
// 256-bit results: the scalar load lands in the low xmm, upper cleared.
289 def : Pat<(v8f32 (X86vzload32 addr:$src)),
290 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
291 def : Pat<(v4f64 (X86vzload64 addr:$src)),
292 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
295 let Predicates = [UseAVX, OptForSize] in {
296 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
297 // MOVSS to the lower bits.
298 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
299 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
300 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
301 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
303 // Move low f32 and clear high bits.
304 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
305 (SUBREG_TO_REG (i32 0),
306 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
307 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
308 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
309 (SUBREG_TO_REG (i32 0),
310 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
311 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
// Same zero-extend idiom for legacy SSE1 targets.
314 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
315 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
316 // MOVSS to the lower bits.
317 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
318 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
319 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
320 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
323 let Predicates = [UseSSE2] in
324 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
325 (MOVSDrm addr:$src)>;
327 let Predicates = [UseSSE1] in
328 def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
329 (MOVSSrm addr:$src)>;
331 //===----------------------------------------------------------------------===//
332 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
333 //===----------------------------------------------------------------------===//
// Packed FP full-register move: a pattern-less rr copy plus a load form
// whose aligned/unaligned behavior comes from the ld_frag parameter.
335 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
336 X86MemOperand x86memop, PatFrag ld_frag,
337 string asm, Domain d,
338 X86SchedWriteMoveLS sched> {
339 let hasSideEffects = 0, isMoveReg = 1 in
340 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
341 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
// Loads are rematerializable: re-reading the (invariant) address is
// cheaper than spilling.
343 let canFoldAsLoad = 1, isReMaterializable = 1 in
344 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
345 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
346 [(set RC:$dst, (ld_frag addr:$src))], d>,
// VEX-encoded 128-bit and 256-bit aligned/unaligned packed moves.
350 let Predicates = [HasAVX, NoVLX] in {
351 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
352 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
354 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
355 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
357 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
358 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
360 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
361 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// 256-bit (Y) variants carry VEX_L.
364 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
365 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
366 PS, VEX, VEX_L, VEX_WIG;
367 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
368 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
369 PD, VEX, VEX_L, VEX_WIG;
370 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
371 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
372 PS, VEX, VEX_L, VEX_WIG;
373 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
374 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
375 PD, VEX, VEX_L, VEX_WIG;
// Legacy SSE encodings, split by minimum feature level.
378 let Predicates = [UseSSE1] in {
379 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
380 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
382 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
383 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
386 let Predicates = [UseSSE2] in {
387 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
388 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
390 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
391 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// VEX-encoded store forms (register -> memory, MRMDestMem).
395 let Predicates = [HasAVX, NoVLX] in {
396 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
397 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
398 "movaps\t{$src, $dst|$dst, $src}",
399 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
401 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
402 "movapd\t{$src, $dst|$dst, $src}",
403 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
405 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
406 "movups\t{$src, $dst|$dst, $src}",
407 [(store (v4f32 VR128:$src), addr:$dst)]>,
409 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
410 "movupd\t{$src, $dst|$dst, $src}",
411 [(store (v2f64 VR128:$src), addr:$dst)]>,
// 256-bit store forms.
415 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
416 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
417 "movaps\t{$src, $dst|$dst, $src}",
418 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
420 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
421 "movapd\t{$src, $dst|$dst, $src}",
422 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
424 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
425 "movups\t{$src, $dst|$dst, $src}",
426 [(store (v8f32 VR256:$src), addr:$dst)]>,
428 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
429 "movupd\t{$src, $dst|$dst, $src}",
430 [(store (v4f64 VR256:$src), addr:$dst)]>,
// Reversed (store-direction) register-register encodings. Pattern-less
// and codegen-only: they exist so the disassembler can represent both
// encodings of the same move.
436 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
438 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
439 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
441 "movaps\t{$src, $dst|$dst, $src}", []>,
442 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
443 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
445 "movapd\t{$src, $dst|$dst, $src}", []>,
446 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
447 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
449 "movups\t{$src, $dst|$dst, $src}", []>,
450 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
451 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
453 "movupd\t{$src, $dst|$dst, $src}", []>,
454 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
// 256-bit reversed encodings.
457 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
458 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
460 "movaps\t{$src, $dst|$dst, $src}", []>,
461 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
462 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
464 "movapd\t{$src, $dst|$dst, $src}", []>,
465 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
466 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
468 "movups\t{$src, $dst|$dst, $src}", []>,
469 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
470 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
472 "movupd\t{$src, $dst|$dst, $src}", []>,
473 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
477 // Reversed version with ".s" suffix for GAS compatibility.
478 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
479 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
480 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
481 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
482 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
483 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
484 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
485 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
486 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
487 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
488 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
489 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
490 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
491 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
492 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
493 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
// Legacy SSE store forms and reversed encodings, mirroring the VEX set
// above but without any VEX attributes.
495 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
496 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
497 "movaps\t{$src, $dst|$dst, $src}",
498 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
499 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
500 "movapd\t{$src, $dst|$dst, $src}",
501 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
502 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
503 "movups\t{$src, $dst|$dst, $src}",
504 [(store (v4f32 VR128:$src), addr:$dst)]>;
505 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
506 "movupd\t{$src, $dst|$dst, $src}",
507 [(store (v2f64 VR128:$src), addr:$dst)]>;
// Disassembler-only reversed register-register encodings.
511 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
512 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
513 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
514 "movaps\t{$src, $dst|$dst, $src}", []>,
515 FoldGenData<"MOVAPSrr">;
516 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
517 "movapd\t{$src, $dst|$dst, $src}", []>,
518 FoldGenData<"MOVAPDrr">;
519 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520 "movups\t{$src, $dst|$dst, $src}", []>,
521 FoldGenData<"MOVUPSrr">;
522 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
523 "movupd\t{$src, $dst|$dst, $src}", []>,
524 FoldGenData<"MOVUPDrr">;
527 // Reversed version with ".s" suffix for GAS compatibility.
528 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
529 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
530 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
531 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
532 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
533 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
534 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
535 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
// Select FP moves for integer vector loads/stores; the domain-fixing
// pass converts them to the integer forms when profitable.
537 let Predicates = [HasAVX, NoVLX] in {
538 // 256-bit load/store need to use floating point load/store in case we don't
539 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
540 // available and changing the domain is beneficial.
541 def : Pat<(alignedloadv4i64 addr:$src),
542 (VMOVAPSYrm addr:$src)>;
543 def : Pat<(alignedloadv8i32 addr:$src),
544 (VMOVAPSYrm addr:$src)>;
545 def : Pat<(alignedloadv16i16 addr:$src),
546 (VMOVAPSYrm addr:$src)>;
547 def : Pat<(alignedloadv32i8 addr:$src),
548 (VMOVAPSYrm addr:$src)>;
549 def : Pat<(loadv4i64 addr:$src),
550 (VMOVUPSYrm addr:$src)>;
551 def : Pat<(loadv8i32 addr:$src),
552 (VMOVUPSYrm addr:$src)>;
553 def : Pat<(loadv16i16 addr:$src),
554 (VMOVUPSYrm addr:$src)>;
555 def : Pat<(loadv32i8 addr:$src),
556 (VMOVUPSYrm addr:$src)>;
558 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
559 (VMOVAPSYmr addr:$dst, VR256:$src)>;
560 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
561 (VMOVAPSYmr addr:$dst, VR256:$src)>;
562 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
563 (VMOVAPSYmr addr:$dst, VR256:$src)>;
564 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
565 (VMOVAPSYmr addr:$dst, VR256:$src)>;
566 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
567 (VMOVUPSYmr addr:$dst, VR256:$src)>;
568 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
569 (VMOVUPSYmr addr:$dst, VR256:$src)>;
570 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
571 (VMOVUPSYmr addr:$dst, VR256:$src)>;
572 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
573 (VMOVUPSYmr addr:$dst, VR256:$src)>;
576 // Use movaps / movups for SSE integer load / store (one byte shorter).
577 // The instructions selected below are then converted to MOVDQA/MOVDQU
578 // during the SSE domain pass.
579 let Predicates = [UseSSE1] in {
580 def : Pat<(alignedloadv2i64 addr:$src),
581 (MOVAPSrm addr:$src)>;
582 def : Pat<(alignedloadv4i32 addr:$src),
583 (MOVAPSrm addr:$src)>;
584 def : Pat<(alignedloadv8i16 addr:$src),
585 (MOVAPSrm addr:$src)>;
586 def : Pat<(alignedloadv16i8 addr:$src),
587 (MOVAPSrm addr:$src)>;
588 def : Pat<(loadv2i64 addr:$src),
589 (MOVUPSrm addr:$src)>;
590 def : Pat<(loadv4i32 addr:$src),
591 (MOVUPSrm addr:$src)>;
592 def : Pat<(loadv8i16 addr:$src),
593 (MOVUPSrm addr:$src)>;
594 def : Pat<(loadv16i8 addr:$src),
595 (MOVUPSrm addr:$src)>;
597 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
598 (MOVAPSmr addr:$dst, VR128:$src)>;
599 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
600 (MOVAPSmr addr:$dst, VR128:$src)>;
601 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
602 (MOVAPSmr addr:$dst, VR128:$src)>;
603 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
604 (MOVAPSmr addr:$dst, VR128:$src)>;
605 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
606 (MOVUPSmr addr:$dst, VR128:$src)>;
607 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
608 (MOVUPSmr addr:$dst, VR128:$src)>;
609 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
610 (MOVUPSmr addr:$dst, VR128:$src)>;
611 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
612 (MOVUPSmr addr:$dst, VR128:$src)>;
615 //===----------------------------------------------------------------------===//
616 // SSE 1 & 2 - Move Low packed FP Instructions
617 //===----------------------------------------------------------------------===//
// Shared base for movlp*/movhp*: a pattern-less PS form (high/low
// behavior is special-cased elsewhere) and a PD form with a pattern.
619 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
620 string base_opc, string asm_opr> {
621 // No pattern as they need be special cased between high and low.
622 let hasSideEffects = 0, mayLoad = 1 in
623 def PSrm : PI<opc, MRMSrcMem,
624 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
625 !strconcat(base_opc, "s", asm_opr),
626 [], SSEPackedSingle>, PS,
627 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
629 def PDrm : PI<opc, MRMSrcMem,
630 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
631 !strconcat(base_opc, "d", asm_opr),
632 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
633 (scalar_to_vector (loadf64 addr:$src2)))))],
634 SSEPackedDouble>, PD,
635 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
// Wrapper instantiating both the VEX (three-operand) and legacy SSE
// (tied two-operand) flavors of the base multiclass.
638 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
640 let Predicates = [UseAVX] in
641 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
642 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
645 let Constraints = "$src1 = $dst" in
646 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
647 "\t{$src2, $dst|$dst, $src2}">;
// MOVLPS/MOVLPD load forms (opcode 0x12, low 64 bits of the xmm).
650 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
// Store forms (opcode 0x13) write the low 64 bits to memory.
652 let SchedRW = [WriteFStore] in {
653 let Predicates = [UseAVX] in {
654 let mayStore = 1, hasSideEffects = 0 in
655 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
656 "movlps\t{$src, $dst|$dst, $src}",
659 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
660 "movlpd\t{$src, $dst|$dst, $src}",
661 [(store (f64 (extractelt (v2f64 VR128:$src),
662 (iPTR 0))), addr:$dst)]>,
665 let mayStore = 1, hasSideEffects = 0 in
666 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
667 "movlps\t{$src, $dst|$dst, $src}",
669 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
670 "movlpd\t{$src, $dst|$dst, $src}",
671 [(store (f64 (extractelt (v2f64 VR128:$src),
672 (iPTR 0))), addr:$dst)]>;
675 let Predicates = [UseSSE1] in {
676 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
677 // end up with a movsd or blend instead of shufp.
678 // No need for aligned load, we're only loading 64-bits.
679 def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
681 (MOVLPSrm VR128:$src1, addr:$src2)>;
682 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
683 (MOVLPSrm VR128:$src1, addr:$src2)>;
// Zero-extending 64-bit load: merge into a zeroed register.
685 def : Pat<(v4f32 (X86vzload64 addr:$src)),
686 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
687 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
688 (MOVLPSmr addr:$dst, VR128:$src)>;
691 //===----------------------------------------------------------------------===//
692 // SSE 1 & 2 - Move Hi packed FP Instructions
693 //===----------------------------------------------------------------------===//
// MOVHPS/MOVHPD load forms (opcode 0x16, high 64 bits of the xmm).
695 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
697 let SchedRW = [WriteFStore] in {
698 // v2f64 extract element 1 is always custom lowered to unpack high to low
699 // and extract element 0 so the non-store version isn't too horrible.
700 let Predicates = [UseAVX] in {
701 let mayStore = 1, hasSideEffects = 0 in
702 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703 "movhps\t{$src, $dst|$dst, $src}",
705 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
706 "movhpd\t{$src, $dst|$dst, $src}",
707 [(store (f64 (extractelt
708 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
709 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
711 let mayStore = 1, hasSideEffects = 0 in
712 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713 "movhps\t{$src, $dst|$dst, $src}",
715 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
716 "movhpd\t{$src, $dst|$dst, $src}",
717 [(store (f64 (extractelt
718 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
719 (iPTR 0))), addr:$dst)]>;
// AVX-specific selection patterns routing unpck/shuffle-of-load shapes
// onto the VMOVHPD/VMOVLPD memory forms.
722 let Predicates = [UseAVX] in {
724 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
725 (VMOVHPDrm VR128:$src1, addr:$src2)>;
727 def : Pat<(store (f64 (extractelt
728 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
729 (iPTR 0))), addr:$dst),
730 (VMOVHPDmr addr:$dst, VR128:$src)>;
733 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
734 (VMOVLPDrm VR128:$src1, addr:$src2)>;
737 let Predicates = [UseSSE1] in {
738 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
739 // end up with a movsd or blend instead of shufp.
740 // No need for aligned load, we're only loading 64-bits.
741 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
742 (MOVHPSrm VR128:$src1, addr:$src2)>;
743 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
744 (MOVHPSrm VR128:$src1, addr:$src2)>;
746 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
748 (MOVHPSmr addr:$dst, VR128:$src)>;
// Legacy SSE2 counterparts of the AVX patterns above.
751 let Predicates = [UseSSE2] in {
753 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
754 (MOVHPDrm VR128:$src1, addr:$src2)>;
756 def : Pat<(store (f64 (extractelt
757 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
758 (iPTR 0))), addr:$dst),
759 (MOVHPDmr addr:$dst, VR128:$src)>;
762 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
763 (MOVLPDrm VR128:$src1, addr:$src2)>;
766 let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
767 // Use MOVLPD to load into the low bits from a full vector unless we can use
769 def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
770 (MOVLPDrm VR128:$src1, addr:$src2)>;
773 //===----------------------------------------------------------------------===//
774 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
775 //===----------------------------------------------------------------------===//
// AVX three-operand MOVLHPS/MOVHLPS.
777 let Predicates = [UseAVX] in {
778 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
779 (ins VR128:$src1, VR128:$src2),
780 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
782 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
783 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
784 let isCommutable = 1 in
785 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
786 (ins VR128:$src1, VR128:$src2),
787 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
789 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
790 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
// Legacy SSE two-operand forms with $src1 tied to $dst.
793 let Constraints = "$src1 = $dst" in {
794 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
795 (ins VR128:$src1, VR128:$src2),
796 "movlhps\t{$src2, $dst|$dst, $src2}",
798 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
799 Sched<[SchedWriteFShuffle.XMM]>;
800 let isCommutable = 1 in
801 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
802 (ins VR128:$src1, VR128:$src2),
803 "movhlps\t{$src2, $dst|$dst, $src2}",
805 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
806 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
809 //===----------------------------------------------------------------------===//
810 // SSE 1 & 2 - Conversion Instructions
811 //===----------------------------------------------------------------------===//
// sse12_cvt_s: scalar conversion (reg and folded-load forms) selected via
// OpNode; `mem` supplies the mnemonic used for the memory form.
// NOTE(review): the `Domain d` parameter line and the closing braces of each
// multiclass are missing from this excerpt.
813 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
814 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
815 string asm, string mem, X86FoldableSchedWrite sched,
817 SchedRead Int2Fpu = ReadDefault> {
818 let ExeDomain = d in {
819 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
820 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
821 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
822 Sched<[sched, Int2Fpu]>;
823 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
824 mem#"\t{$src, $dst|$dst, $src}",
825 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
826 Sched<[sched.Folded]>;
// sse12_cvt_p: packed int->fp conversion; reads MXCSR and may raise FP
// exceptions.
830 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
831 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
832 string asm, Domain d, X86FoldableSchedWrite sched> {
833 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
834 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
835 [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
838 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
839 [(set RC:$dst, (DstTy (any_sint_to_fp
840 (SrcTy (ld_frag addr:$src)))))], d>,
841 Sched<[sched.Folded]>;
// sse12_vcvt_avx: AVX 3-operand scalar conversion shells (no patterns);
// $src1 passes through the untouched upper elements.
845 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
846 X86MemOperand x86memop, string asm, string mem,
847 X86FoldableSchedWrite sched, Domain d> {
848 let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
849 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
850 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
851 Sched<[sched, ReadDefault, ReadInt2Fpu]>;
853 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
854 (ins DstRC:$src1, x86memop:$src),
855 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
856 Sched<[sched.Folded, sched.ReadAfterFold]>;
857 } // hasSideEffects = 0
// AVX scalar fp->int conversions. 0x2C = truncating (cvtt*), 0x2D = rounding
// per MXCSR (used here to implement lrint/llrint). isCodeGenOnly: the
// assembler uses the _Int variants defined later.
860 let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
861 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
862 "cvttss2si", "cvttss2si",
863 WriteCvtSS2I, SSEPackedSingle>,
865 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
866 "cvttss2si", "cvttss2si",
867 WriteCvtSS2I, SSEPackedSingle>,
868 XS, VEX, VEX_W, VEX_LIG;
869 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
870 "cvttsd2si", "cvttsd2si",
871 WriteCvtSD2I, SSEPackedDouble>,
873 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
874 "cvttsd2si", "cvttsd2si",
875 WriteCvtSD2I, SSEPackedDouble>,
876 XD, VEX, VEX_W, VEX_LIG;
878 defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
879 "cvtss2si", "cvtss2si",
880 WriteCvtSS2I, SSEPackedSingle>,
882 defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
883 "cvtss2si", "cvtss2si",
884 WriteCvtSS2I, SSEPackedSingle>,
885 XS, VEX, VEX_W, VEX_LIG;
886 defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
887 "cvtsd2si", "cvtsd2si",
888 WriteCvtSD2I, SSEPackedDouble>,
890 defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
891 "cvtsd2si", "cvtsd2si",
892 WriteCvtSD2I, SSEPackedDouble>,
893 XD, VEX, VEX_W, VEX_LIG;
896 // The assembler can recognize rr 64-bit instructions by seeing a rxx
897 // register, but the same isn't true when only using memory operands,
898 // provide other assembly "l" and "q" forms to address this explicitly
899 // where appropriate to do so.
// AVX int->fp conversions ("l"/"q" memory-form mnemonic suffixes).
900 let isCodeGenOnly = 1 in {
901 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
902 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
904 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
905 WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
906 VEX_W, VEX_LIG, SIMD_EXC;
907 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
908 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
910 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
911 WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
912 VEX_W, VEX_LIG, SIMD_EXC;
913 } // isCodeGenOnly = 1
// Select scalar int->fp through the AVX 3-operand forms; the pass-through
// source is an IMPLICIT_DEF since only the low element is defined.
// The lrint/llrint i64 patterns map onto the MXCSR-rounding cvt forms.
915 let Predicates = [UseAVX] in {
916 def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
917 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
918 def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
919 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
920 def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
921 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
922 def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
923 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
925 def : Pat<(f32 (any_sint_to_fp GR32:$src)),
926 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
927 def : Pat<(f32 (any_sint_to_fp GR64:$src)),
928 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
929 def : Pat<(f64 (any_sint_to_fp GR32:$src)),
930 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
931 def : Pat<(f64 (any_sint_to_fp GR64:$src)),
932 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
934 def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
935 def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
937 def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
938 def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
// Legacy SSE scalar conversions, mirroring the AVX defms above without VEX.
// NOTE(review): CVTSI2SD (line 976) lacks SIMD_EXC unlike its siblings —
// confirm against upstream whether that omission is intentional.
941 let isCodeGenOnly = 1 in {
942 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
943 "cvttss2si", "cvttss2si",
944 WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
945 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
946 "cvttss2si", "cvttss2si",
947 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
948 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
949 "cvttsd2si", "cvttsd2si",
950 WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
951 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
952 "cvttsd2si", "cvttsd2si",
953 WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
955 defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
956 "cvtss2si", "cvtss2si",
957 WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
958 defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
959 "cvtss2si", "cvtss2si",
960 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
961 defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
962 "cvtsd2si", "cvtsd2si",
963 WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
964 defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
965 "cvtsd2si", "cvtsd2si",
966 WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
968 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
969 "cvtsi2ss", "cvtsi2ss{l}",
970 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
971 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
972 "cvtsi2ss", "cvtsi2ss{q}",
973 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
974 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
975 "cvtsi2sd", "cvtsi2sd{l}",
976 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
977 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
978 "cvtsi2sd", "cvtsi2sd{q}",
979 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
980 } // isCodeGenOnly = 1
// lrint lowering for the non-AVX paths.
982 let Predicates = [UseSSE1] in {
983 def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
984 def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
987 let Predicates = [UseSSE2] in {
988 def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
989 def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
992 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
993 // and/or XMM operand(s).
// sse12_cvt_sint: intrinsic-style conversion taking a full XMM operand
// (VT source) instead of a scalar FR register; emits rr_Int/rm_Int.
// NOTE(review): the rr_Int Sched<> line and closing braces are missing from
// this excerpt.
995 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
996 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
997 Operand memop, PatFrags mem_frags, string asm,
998 X86FoldableSchedWrite sched, Domain d> {
999 let ExeDomain = d in {
1000 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1001 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1002 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1004 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1005 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1006 [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1007 Sched<[sched.Folded]>;
// sse12_cvt_sint_3addr: pattern-less 3-operand (or 2-address when
// Is2Addr = 1) intrinsic shells used for cvtsi2ss/cvtsi2sd.
1011 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1012 RegisterClass DstRC, X86MemOperand x86memop,
1013 string asm, string mem, X86FoldableSchedWrite sched,
1014 Domain d, bit Is2Addr = 1> {
1015 let hasSideEffects = 0, ExeDomain = d in {
1016 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1018 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1019 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1020 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1022 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1023 (ins DstRC:$src1, x86memop:$src2),
1025 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
1026 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
1027 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Intrinsic cvtsd2si (rounding per MXCSR) and the 3-operand cvtsi2ss/sd
// intrinsic shells; AVX forms take an explicit first source (Is2Addr = 0),
// SSE forms are tied 2-address.
1031 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1032 let Predicates = [UseAVX] in {
1033 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1034 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1035 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1036 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1037 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1038 WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
1040 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1041 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1042 SSEPackedDouble>, XD;
1043 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1044 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1045 SSEPackedDouble>, XD, REX_W;
1048 let Predicates = [UseAVX] in {
1049 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1050 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1051 XS, VEX_4V, VEX_LIG, SIMD_EXC;
1052 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1053 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1054 XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1055 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1056 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1057 XD, VEX_4V, VEX_LIG;
1058 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1059 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1060 XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1062 let Constraints = "$src1 = $dst" in {
1063 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1064 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1066 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1067 i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1068 XS, REX_W, SIMD_EXC;
1069 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1070 i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1072 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1073 i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1074 XD, REX_W, SIMD_EXC;
// AT&T-syntax aliases: explicit {l}/{q} suffixes for the register forms, and
// a suffix-less memory form that defaults to the 32-bit (i32mem) variant.
1077 def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1078 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1079 def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1080 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1081 def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1082 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1083 def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1084 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1086 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1087 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1088 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1089 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
// Legacy SSE spellings of the same aliases.
1091 def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1092 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1093 def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1094 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1095 def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1096 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1097 def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1098 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1100 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1101 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1102 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1103 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1107 // Aliases for intrinsics
// Truncating fp->int intrinsic forms (X86cvtts2Int) for AVX and legacy SSE.
// NOTE(review): VCVTTSD2SI/VCVTTSD2SI64 use WriteCvtSS2I rather than
// WriteCvtSD2I — confirm against upstream whether that is intentional.
1108 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1109 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1110 ssmem, sse_load_f32, "cvttss2si",
1111 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1112 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1113 X86cvtts2Int, ssmem, sse_load_f32,
1114 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1115 XS, VEX, VEX_LIG, VEX_W;
1116 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1117 sdmem, sse_load_f64, "cvttsd2si",
1118 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1119 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1120 X86cvtts2Int, sdmem, sse_load_f64,
1121 "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
1122 XD, VEX, VEX_LIG, VEX_W;
1124 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1125 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1126 ssmem, sse_load_f32, "cvttss2si",
1127 WriteCvtSS2I, SSEPackedSingle>, XS;
1128 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1129 X86cvtts2Int, ssmem, sse_load_f32,
1130 "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1132 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1133 sdmem, sse_load_f64, "cvttsd2si",
1134 WriteCvtSD2I, SSEPackedDouble>, XD;
1135 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1136 X86cvtts2Int, sdmem, sse_load_f64,
1137 "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
// AT&T aliases mapping suffixed cvtt* spellings onto the _Int forms:
// {l} -> GR32 destinations, {q} -> GR64 destinations, for both VEX and
// legacy encodings, register and memory sources.
1141 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1142 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1143 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1144 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1145 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1146 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1147 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1148 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1149 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1150 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1151 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1152 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1153 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1154 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1155 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1156 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1158 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1159 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1160 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1161 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1162 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1163 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1164 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1165 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1166 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1167 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1168 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1169 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1170 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1171 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1172 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1173 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
// Rounding (MXCSR) cvtss2si intrinsic forms, then packed dword->float
// (CVTDQ2PS) via sse12_cvt_p. The legacy CVTDQ2PS requires SSE2.
1175 let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1176 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1177 ssmem, sse_load_f32, "cvtss2si",
1178 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1179 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1180 ssmem, sse_load_f32, "cvtss2si",
1181 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
1183 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1184 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1185 ssmem, sse_load_f32, "cvtss2si",
1186 WriteCvtSS2I, SSEPackedSingle>, XS;
1187 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1188 ssmem, sse_load_f32, "cvtss2si",
1189 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1191 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1192 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1193 SSEPackedSingle, WriteCvtI2PS>,
1194 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1195 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1196 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1197 SSEPackedSingle, WriteCvtI2PSY>,
1198 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1200 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1201 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1202 SSEPackedSingle, WriteCvtI2PS>,
1203 PS, Requires<[UseSSE2]>;
// AT&T aliases for the rounding cvt{ss,sd}2si _Int forms, mirroring the
// cvtt* alias block above ({l} -> GR32, {q} -> GR64).
1207 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1208 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1209 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1210 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1211 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1212 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1213 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1214 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1215 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1216 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1217 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1218 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1219 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1220 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1221 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1222 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1225 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1226 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1227 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1228 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1229 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1230 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1231 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1232 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1233 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1234 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1235 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1236 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1237 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1238 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1239 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1240 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1244 // Convert scalar double to scalar single
// Scalar f64->f32 (opcode 0x5A, XD prefix). isCodeGenOnly FR-register forms
// first, then the _Int XMM forms matched via X86frounds.
// NOTE(review): several `[(set ...` pattern lines and closing braces are
// missing from this excerpt.
1245 let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
1246 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1247 (ins FR32:$src1, FR64:$src2),
1248 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1249 VEX_4V, VEX_LIG, VEX_WIG,
1250 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1252 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1253 (ins FR32:$src1, f64mem:$src2),
1254 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1255 XD, VEX_4V, VEX_LIG, VEX_WIG,
1256 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
// fpround on FR64 selects the VEX form with an undefined pass-through.
1259 def : Pat<(f32 (any_fpround FR64:$src)),
1260 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1263 let isCodeGenOnly = 1 in {
1264 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1265 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1266 [(set FR32:$dst, (any_fpround FR64:$src))]>,
1267 Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1268 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1269 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1270 [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1271 XD, Requires<[UseSSE2, OptForSize]>,
1272 Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
// _Int forms: full-XMM operands; upper elements come from $src1.
1275 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1276 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1277 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1278 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1280 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1281 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1282 Sched<[WriteCvtSD2SS]>;
1283 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1284 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1285 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1287 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1288 XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1289 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1290 let Constraints = "$src1 = $dst" in {
1291 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1292 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1293 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1295 (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1296 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1297 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1298 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1299 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1301 (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1302 XD, Requires<[UseSSE2]>,
1303 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1307 // Convert scalar single to scalar double
1308 // SSE2 instructions with XS prefix
// Scalar f32->f64 (opcode 0x5A, XS prefix). The AVX memory form is gated on
// OptForSize; the _Int shells below carry no patterns (hasSideEffects = 0).
1309 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1310 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1311 (ins FR64:$src1, FR32:$src2),
1312 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1313 XS, VEX_4V, VEX_LIG, VEX_WIG,
1314 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
1316 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1317 (ins FR64:$src1, f32mem:$src2),
1318 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1319 XS, VEX_4V, VEX_LIG, VEX_WIG,
1320 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1321 Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1322 } // isCodeGenOnly = 1, hasSideEffects = 0
// fpextend lowers to the VEX forms with an undefined pass-through operand.
1324 def : Pat<(f64 (any_fpextend FR32:$src)),
1325 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1326 def : Pat<(any_fpextend (loadf32 addr:$src)),
1327 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1329 let isCodeGenOnly = 1 in {
1330 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1331 "cvtss2sd\t{$src, $dst|$dst, $src}",
1332 [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1333 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1334 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1335 "cvtss2sd\t{$src, $dst|$dst, $src}",
1336 [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1337 XS, Requires<[UseSSE2, OptForSize]>,
1338 Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1339 } // isCodeGenOnly = 1
1341 let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
1342 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1343 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1344 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1345 []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1346 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1348 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1349 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1350 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1351 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1352 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1353 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1354 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1355 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1356 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1357 []>, XS, Requires<[UseSSE2]>,
1358 Sched<[WriteCvtSS2SD]>;
1360 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1361 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1362 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1363 []>, XS, Requires<[UseSSE2]>,
1364 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1366 } // hasSideEffects = 0
1368 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1369 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1370 // vmovs{s,d} instructions
// Match movss/movsd-of-converted-scalar idioms directly to the _Int forms so
// no separate blend instruction is emitted.
// NOTE(review): each Pat is missing its first X86Movs{s,d} operand line
// (the `(v4f32 VR128:$dst),`-style line) in this excerpt.
1371 let Predicates = [UseAVX] in {
1372 def : Pat<(v4f32 (X86Movss
1374 (v4f32 (scalar_to_vector
1375 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1376 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1378 def : Pat<(v2f64 (X86Movsd
1380 (v2f64 (scalar_to_vector
1381 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1382 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1384 def : Pat<(v4f32 (X86Movss
1386 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1387 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1389 def : Pat<(v4f32 (X86Movss
1391 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1392 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1394 def : Pat<(v4f32 (X86Movss
1396 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1397 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1399 def : Pat<(v4f32 (X86Movss
1401 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1402 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1404 def : Pat<(v2f64 (X86Movsd
1406 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1407 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1409 def : Pat<(v2f64 (X86Movsd
1411 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1412 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1414 def : Pat<(v2f64 (X86Movsd
1416 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1417 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1419 def : Pat<(v2f64 (X86Movsd
1421 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1422 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1423 } // Predicates = [UseAVX]
// Legacy SSE counterparts of the movs{s,d}-elimination patterns above,
// split by the minimum feature level each target instruction needs.
1425 let Predicates = [UseSSE2] in {
1426 def : Pat<(v4f32 (X86Movss
1428 (v4f32 (scalar_to_vector
1429 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1430 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1432 def : Pat<(v2f64 (X86Movsd
1434 (v2f64 (scalar_to_vector
1435 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1436 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1438 def : Pat<(v2f64 (X86Movsd
1440 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1441 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1443 def : Pat<(v2f64 (X86Movsd
1445 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1446 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1448 def : Pat<(v2f64 (X86Movsd
1450 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1451 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1453 def : Pat<(v2f64 (X86Movsd
1455 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1456 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1457 } // Predicates = [UseSSE2]
1459 let Predicates = [UseSSE1] in {
1460 def : Pat<(v4f32 (X86Movss
1462 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1463 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1465 def : Pat<(v4f32 (X86Movss
1467 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1468 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1470 def : Pat<(v4f32 (X86Movss
1472 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1473 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1475 def : Pat<(v4f32 (X86Movss
1477 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1478 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1479 } // Predicates = [UseSSE1]
// Packed f32->i32 with MXCSR rounding (CVTPS2DQ, opcode 0x5B, 66 prefix):
// AVX 128/256-bit forms, then the legacy SSE2 128-bit forms.
1481 let Predicates = [HasAVX, NoVLX] in {
1482 // Convert packed single/double fp to doubleword
1483 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1484 "cvtps2dq\t{$src, $dst|$dst, $src}",
1485 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1486 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1487 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1488 "cvtps2dq\t{$src, $dst|$dst, $src}",
1490 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1491 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1492 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1493 "cvtps2dq\t{$src, $dst|$dst, $src}",
1495 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1496 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1497 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1498 "cvtps2dq\t{$src, $dst|$dst, $src}",
1500 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1501 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1503 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1504 "cvtps2dq\t{$src, $dst|$dst, $src}",
1505 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1506 Sched<[WriteCvtPS2I]>, SIMD_EXC;
1507 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1508 "cvtps2dq\t{$src, $dst|$dst, $src}",
1510 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1511 Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1514 // Convert Packed Double FP to Packed DW Integers
// CVTPD2DQ narrows the result, so both 128- and 256-bit sources write a
// VR128; the {x}/{y} mnemonic suffixes disambiguate the memory forms.
1515 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1516 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1517 // register, but the same isn't true when using memory operands instead.
1518 // Provide other assembly rr and rm forms to address this explicitly.
1519 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1520 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1522 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1523 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1526 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1527 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1529 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1530 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1533 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1534 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1536 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1537 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1538 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1539 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1541 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1542 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
// Explicit x/y-suffixed register-form aliases (see comment above).
1545 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1546 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1547 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1548 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1550 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1551 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1553 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1554 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1555 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1556 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1558 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1559 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1561 // Convert with truncation packed single/double fp to doubleword
1562 // SSE2 packed instructions with XS prefix
1563 let Uses = [MXCSR], mayRaiseFPException = 1 in {
1564 let Predicates = [HasAVX, NoVLX] in {
1565 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1566 "cvttps2dq\t{$src, $dst|$dst, $src}",
1568 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1569 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1570 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1571 "cvttps2dq\t{$src, $dst|$dst, $src}",
1573 (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1574 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1575 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1576 "cvttps2dq\t{$src, $dst|$dst, $src}",
1578 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1579 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1580 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1581 "cvttps2dq\t{$src, $dst|$dst, $src}",
1583 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1585 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1588 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1589 "cvttps2dq\t{$src, $dst|$dst, $src}",
1591 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1592 Sched<[WriteCvtPS2I]>;
1593 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1594 "cvttps2dq\t{$src, $dst|$dst, $src}",
1596 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1597 Sched<[WriteCvtPS2ILd]>;
1600 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1601 // register, but the same isn't true when using memory operands instead.
1602 // Provide other assembly rr and rm forms to address this explicitly.
1603 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1605 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1606 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1608 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1609 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1610 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1611 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1613 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1614 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1617 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1618 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1620 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1621 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1622 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1623 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1625 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1626 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1627 } // Predicates = [HasAVX, NoVLX]
// AT&T-syntax-only aliases for the truncating form; 'x'/'y' pick the 128-/256-bit
// source register class explicitly (see the comment above about ymm inference).
1629 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1630 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1631 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1632 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
// Select generic (possibly strict) fp_to_sint of v4f64 -> v4i32 onto the
// truncating VCVTTPD2DQY forms (register and folded-load).
1634 let Predicates = [HasAVX, NoVLX] in {
1635 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1636 (VCVTTPD2DQYrr VR256:$src)>;
1637 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1638 (VCVTTPD2DQYrm addr:$src)>;
1641 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1642 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1644 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1645 Sched<[WriteCvtPD2I]>, SIMD_EXC;
1646 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1647 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1649 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1650 Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1652 // Convert packed single to packed double
// AVX cvtps2pd: widen packed f32 to f64. The 128-bit form extends the low two
// f32 lanes (memory form loads only 64 bits); the Y form extends four lanes.
// Uses/mayRaiseFPException: conversion depends on MXCSR and can trap.
1653 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1654 // SSE2 instructions without OpSize prefix
1655 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1656 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1657 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1658 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1659 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1660 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1661 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1662 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1663 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1664 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1665 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1666 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1667 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1668 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1669 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1670 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
// SSE2 (non-VEX) cvtps2pd: widen the low two packed f32 lanes to f64.
// The memory form reads only 64 bits (two f32 values), hence f64mem.
1673 let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1674 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1675 "cvtps2pd\t{$src, $dst|$dst, $src}",
1676 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1677 PS, Sched<[WriteCvtPS2PD]>;
1678 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1679 "cvtps2pd\t{$src, $dst|$dst, $src}",
1680 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1681 PS, Sched<[WriteCvtPS2PD.Folded]>;
1684 // Convert Packed DW Integers to Packed Double FP
1685 let Predicates = [HasAVX, NoVLX] in {
1686 let hasSideEffects = 0, mayLoad = 1 in
1687 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1688 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1690 (v2f64 (X86any_VSintToFP
1692 (v2i64 (scalar_to_vector
1693 (loadi64 addr:$src)))))))]>,
1694 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1695 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1696 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1698 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1699 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1700 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1701 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1703 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1704 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1706 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1707 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1709 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1710 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1713 let hasSideEffects = 0, mayLoad = 1 in
1714 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1715 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1717 (v2f64 (X86any_VSintToFP
1719 (v2i64 (scalar_to_vector
1720 (loadi64 addr:$src)))))))]>,
1721 Sched<[WriteCvtI2PDLd]>;
1722 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1723 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1725 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1726 Sched<[WriteCvtI2PD]>;
1728 // AVX register conversion intrinsics
// Fold a zero-extending 64-bit vector load feeding int->double conversion
// directly into the memory form of (V)CVTDQ2PD, for AVX and SSE2 respectively.
1729 let Predicates = [HasAVX, NoVLX] in {
1730 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1731 (VCVTDQ2PDrm addr:$src)>;
1732 } // Predicates = [HasAVX, NoVLX]
1734 // SSE2 register conversion intrinsics
1735 let Predicates = [UseSSE2] in {
1736 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1737 (CVTDQ2PDrm addr:$src)>;
1738 } // Predicates = [UseSSE2]
1740 // Convert packed double to packed single
1741 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1742 // register, but the same isn't true when using memory operands instead.
1743 // Provide other assembly rr and rm forms to address this explicitly.
// AVX cvtpd2ps: narrow packed f64 to f32 (destination is always xmm).
// The {x}/{y} mnemonic suffixes on the memory forms tell the assembler the
// source width, since a memory operand carries no size information.
1744 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1746 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1747 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1748 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1749 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1750 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1751 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1752 [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
1753 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1755 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1756 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1757 [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
1758 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1759 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1760 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1761 [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
1762 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1763 } // Predicates = [HasAVX, NoVLX]
// AT&T-syntax-only aliases: explicit 'x'/'y' suffix selects the 128-/256-bit
// register form of vcvtpd2ps.
1765 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1766 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1767 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1768 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
// SSE2 (non-VEX) cvtpd2ps: narrow 2 packed f64 to f32. memopv2f64 in the rm
// form requires an aligned load, per SSE memory-operand semantics.
1770 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1771 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1772 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1773 Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1774 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1775 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1776 [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
1777 Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1779 //===----------------------------------------------------------------------===//
1780 // SSE 1 & 2 - Compare Instructions
1781 //===----------------------------------------------------------------------===//
1783 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
1784 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1785 Operand memop, SDNode OpNode, ValueType VT,
1786 PatFrag ld_frag, string asm,
1787 X86FoldableSchedWrite sched,
1788 PatFrags mem_frags> {
1789 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1790 (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1791 [(set VR128:$dst, (OpNode (VT VR128:$src1),
1792 VR128:$src2, timm:$cc))]>,
1793 Sched<[sched]>, SIMD_EXC;
1795 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1796 (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1797 [(set VR128:$dst, (OpNode (VT VR128:$src1),
1798 (mem_frags addr:$src2), timm:$cc))]>,
1799 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1801 let isCodeGenOnly = 1 in {
1802 let isCommutable = 1 in
1803 def rr : SIi8<0xC2, MRMSrcReg,
1804 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1805 [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1806 Sched<[sched]>, SIMD_EXC;
1807 def rm : SIi8<0xC2, MRMSrcMem,
1808 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1809 [(set RC:$dst, (OpNode RC:$src1,
1810 (ld_frag addr:$src2), timm:$cc))]>,
1811 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
// AVX scalar compares (3-operand, non-destructive). Instantiates
// sse12_cmp_scalar for f32 (XS prefix) and f64 (XD prefix).
1815 let ExeDomain = SSEPackedSingle in
1816 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1817 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1818 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1819 XS, VEX_4V, VEX_LIG, VEX_WIG;
1820 let ExeDomain = SSEPackedDouble in
1821 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1822 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1823 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1824 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy SSE scalar compares: 2-operand, destructive ($src1 tied to $dst).
1826 let Constraints = "$src1 = $dst" in {
1827 let ExeDomain = SSEPackedSingle in
1828 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1829 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1830 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1831 let ExeDomain = SSEPackedDouble in
1832 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1833 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1834 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1837 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
1838 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1839 ValueType vt, X86MemOperand x86memop,
1840 PatFrag ld_frag, string OpcodeStr, Domain d,
1841 X86FoldableSchedWrite sched = WriteFComX> {
1842 let ExeDomain = d in {
1843 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1844 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1845 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1846 Sched<[sched]>, SIMD_EXC;
1848 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1849 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1850 [(set EFLAGS, (OpNode (vt RC:$src1),
1851 (ld_frag addr:$src2)))]>,
1852 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1856 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
1857 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1858 ValueType vt, Operand memop,
1859 PatFrags mem_frags, string OpcodeStr,
1861 X86FoldableSchedWrite sched = WriteFComX> {
1862 let ExeDomain = d in {
1863 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1864 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1865 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1866 Sched<[sched]>, SIMD_EXC;
1868 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1869 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1870 [(set EFLAGS, (OpNode (vt RC:$src1),
1871 (mem_frags addr:$src2)))]>,
1872 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
// AVX ordered/unordered scalar FP compares that set EFLAGS.
// ucomis* uses the quiet compare node (X86any_fcmp); comis* uses the
// signalling one (X86strict_fcmps).
1876 let Defs = [EFLAGS] in {
1877 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1878 "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1879 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1880 "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1881 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1882 "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1883 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1884 "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
// Intrinsic (VR128-operand) variants of the AVX comis/ucomis compares,
// used only by codegen for the *_Int intrinsic lowerings.
1886 let isCodeGenOnly = 1 in {
1887 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1888 sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1889 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1890 sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1892 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1893 sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1894 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1895 sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
// Legacy SSE ordered/unordered scalar compares setting EFLAGS (non-VEX).
1897 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1898 "ucomiss", SSEPackedSingle>, PS;
1899 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1900 "ucomisd", SSEPackedDouble>, PD;
1901 defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1902 "comiss", SSEPackedSingle>, PS;
1903 defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1904 "comisd", SSEPackedDouble>, PD;
// Intrinsic (VR128-operand) variants of the legacy comis/ucomis compares.
1906 let isCodeGenOnly = 1 in {
1907 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1908 sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1909 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1910 sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1912 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1913 sse_load_f32, "comiss", SSEPackedSingle>, PS;
1914 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1915 sse_load_f64, "comisd", SSEPackedDouble>, PD;
1917 } // Defs = [EFLAGS]
1919 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
1920 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1921 ValueType VT, string asm,
1922 X86FoldableSchedWrite sched,
1923 Domain d, PatFrag ld_frag> {
1924 let isCommutable = 1 in
1925 def rri : PIi8<0xC2, MRMSrcReg,
1926 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1927 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1928 Sched<[sched]>, SIMD_EXC;
1929 def rmi : PIi8<0xC2, MRMSrcMem,
1930 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1932 (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1933 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
// AVX packed compares (3-operand): 128-bit PS/PD and 256-bit PSY/PDY forms.
1936 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1937 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1938 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1939 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1940 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1941 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1942 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1943 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1944 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1945 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1946 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1947 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE packed compares: 2-operand, destructive; rm forms require
// aligned loads (memopv*).
1948 let Constraints = "$src1 = $dst" in {
1949 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1950 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1951 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1952 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1953 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1954 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1957 def CommutableCMPCC : PatLeaf<(timm), [{
1958 uint64_t Imm = N->getZExtValue() & 0x7;
1959 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1962 // Patterns to select compares with loads in first operand.
// These only fire for CommutableCMPCC predicates (EQ/UNORD/NEQ/ORD), where
// swapping operands preserves the result, so the load can be folded into
// the second (memory) operand of the cmp instruction.
1963 let Predicates = [HasAVX] in {
1964 def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1965 CommutableCMPCC:$cc)),
1966 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1968 def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
1969 CommutableCMPCC:$cc)),
1970 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1972 def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
1973 CommutableCMPCC:$cc)),
1974 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1976 def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
1977 CommutableCMPCC:$cc)),
1978 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1980 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1981 CommutableCMPCC:$cc)),
1982 (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1984 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1985 CommutableCMPCC:$cc)),
1986 (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
// Legacy SSE versions of the same commuted-load folds (aligned packed loads).
1989 let Predicates = [UseSSE2] in {
1990 def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
1991 CommutableCMPCC:$cc)),
1992 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1994 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1995 CommutableCMPCC:$cc)),
1996 (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1999 let Predicates = [UseSSE1] in {
2000 def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2001 CommutableCMPCC:$cc)),
2002 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2004 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2005 CommutableCMPCC:$cc)),
2006 (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2009 //===----------------------------------------------------------------------===//
2010 // SSE 1 & 2 - Shuffle Instructions
2011 //===----------------------------------------------------------------------===//
2013 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2014 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2015 ValueType vt, string asm, PatFrag mem_frag,
2016 X86FoldableSchedWrite sched, Domain d,
2017 bit IsCommutable = 0> {
2018 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2019 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2020 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2021 (i8 timm:$src3))))], d>,
2022 Sched<[sched.Folded, sched.ReadAfterFold]>;
2023 let isCommutable = IsCommutable in
2024 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2025 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2026 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2027 (i8 timm:$src3))))], d>,
// AVX shufps/shufpd (3-operand): 128- and 256-bit forms via sse12_shuffle.
2031 let Predicates = [HasAVX, NoVLX] in {
2032 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2033 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2034 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2035 PS, VEX_4V, VEX_WIG;
2036 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2037 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2038 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2039 PS, VEX_4V, VEX_L, VEX_WIG;
2040 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2041 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2042 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2043 PD, VEX_4V, VEX_WIG;
2044 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2045 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2046 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2047 PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE shufps/shufpd: 2-operand destructive forms; SHUFPD's rri form is
// marked commutable (IsCommutable = 1) via the trailing template argument.
2049 let Constraints = "$src1 = $dst" in {
2050 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2051 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2052 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2053 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2054 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2055 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2058 //===----------------------------------------------------------------------===//
2059 // SSE 1 & 2 - Unpack FP Instructions
2060 //===----------------------------------------------------------------------===//
2062 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2063 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2064 PatFrag mem_frag, RegisterClass RC,
2065 X86MemOperand x86memop, string asm,
2066 X86FoldableSchedWrite sched, Domain d,
2067 bit IsCommutable = 0> {
2068 let isCommutable = IsCommutable in
2069 def rr : PI<opc, MRMSrcReg,
2070 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2072 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2074 def rm : PI<opc, MRMSrcMem,
2075 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2077 (vt (OpNode RC:$src1,
2078 (mem_frag addr:$src2))))], d>,
2079 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX unpck{h,l}p{s,d}: 128- and 256-bit interleave forms (3-operand).
// The unaligned 'load' frag is fine here: VEX-encoded memory operands have no
// alignment requirement.
2082 let Predicates = [HasAVX, NoVLX] in {
2083 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2084 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2085 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2086 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2087 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2088 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2089 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2090 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2091 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2092 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2093 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2094 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2096 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2097 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2098 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2099 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2100 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2101 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2102 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2103 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2104 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2105 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2106 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2107 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2108 }// Predicates = [HasAVX, NoVLX]
// Legacy SSE unpck{h,l}p{s,d}: 2-operand destructive forms with aligned
// memory operands ('memop').
2110 let Constraints = "$src1 = $dst" in {
2111 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2112 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2113 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2114 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2115 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2116 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2117 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2118 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2119 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2120 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2121 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2122 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2123 } // Constraints = "$src1 = $dst"
// AVX1-only: no 256-bit integer unpcks exist, so lower v8i32/v4i64 unpack
// nodes onto the equivalent FP-domain VUNPCK*PS/PD Y instructions.
2125 let Predicates = [HasAVX1Only] in {
2126 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2127 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2128 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2129 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2130 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2131 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2132 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2133 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2135 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2136 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2137 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2138 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2139 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2140 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2141 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2142 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2145 let Predicates = [UseSSE2] in {
2146 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
// (simple_load matches a plain, possibly-unaligned load; UNPCKLPDrm would
// require 16-byte alignment.)
2147 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2148 (v2f64 (simple_load addr:$src2)))),
2149 (MOVHPDrm VR128:$src1, addr:$src2)>;
2152 //===----------------------------------------------------------------------===//
2153 // SSE 1 & 2 - Extract Floating-Point Sign mask
2154 //===----------------------------------------------------------------------===//
2156 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2157 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2158 string asm, Domain d> {
2159 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2160 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2161 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2162 Sched<[WriteFMOVMSK]>;
// AVX movmskps/movmskpd: extract packed FP sign bits into a GPR.
2165 let Predicates = [HasAVX] in {
2166 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2167 SSEPackedSingle>, PS, VEX, VEX_WIG;
2168 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2169 SSEPackedDouble>, PD, VEX, VEX_WIG;
2170 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2171 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2172 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2173 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2175 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2176 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2177 (VMOVMSKPSrr VR128:$src)>;
2178 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2179 (VMOVMSKPDrr VR128:$src)>;
2180 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2181 (VMOVMSKPSYrr VR256:$src)>;
2182 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2183 (VMOVMSKPDYrr VR256:$src)>;
// Legacy SSE movmskps/movmskpd and the matching integer-VT patterns.
2186 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2187 SSEPackedSingle>, PS;
2188 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2189 SSEPackedDouble>, PD;
2191 let Predicates = [UseSSE2] in {
2192 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2193 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2194 (MOVMSKPSrr VR128:$src)>;
2195 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2196 (MOVMSKPDrr VR128:$src)>;
2199 //===---------------------------------------------------------------------===//
2200 // SSE2 - Packed Integer Logical Instructions
2201 //===---------------------------------------------------------------------===//
2203 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2205 /// PDI_binop_rm - Simple SSE2 binary operator.
2206 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2207 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2208 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2209 bit IsCommutable, bit Is2Addr> {
2210 let isCommutable = IsCommutable in
2211 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2212 (ins RC:$src1, RC:$src2),
2214 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2215 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2216 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2218 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2219 (ins RC:$src1, x86memop:$src2),
2221 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2222 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2223 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2224 Sched<[sched.Folded, sched.ReadAfterFold]>;
2226 } // ExeDomain = SSEPackedInt
2228 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2229 ValueType OpVT128, ValueType OpVT256,
2230 X86SchedWriteWidths sched, bit IsCommutable,
2232 let Predicates = [HasAVX, prd] in
2233 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2234 VR128, load, i128mem, sched.XMM,
2235 IsCommutable, 0>, VEX_4V, VEX_WIG;
2237 let Constraints = "$src1 = $dst" in
2238 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2239 memop, i128mem, sched.XMM, IsCommutable, 1>;
2241 let Predicates = [HasAVX2, prd] in
2242 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2243 OpVT256, VR256, load, i256mem, sched.YMM,
2244 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2247 // These are ordered here for pattern ordering requirements with the fp versions
// Packed integer logicals on v2i64/v4i64. PAND/POR/PXOR are commutable;
// PANDN (x86 andnot: ~src1 & src2) is not.
2249 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2250 SchedWriteVecLogic, 1, NoVLX>;
2251 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2252 SchedWriteVecLogic, 1, NoVLX>;
2253 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2254 SchedWriteVecLogic, 1, NoVLX>;
2255 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2256 SchedWriteVecLogic, 0, NoVLX>;
2258 //===----------------------------------------------------------------------===//
2259 // SSE 1 & 2 - Logical Instructions
2260 //===----------------------------------------------------------------------===//
2262 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2264 /// There are no patterns here because isel prefers integer versions for SSE2
2265 /// and later. There are SSE1 v4f32 patterns later.
// sse12_fp_packed_logical - Instantiate the PS (single) and PD (double)
// variants of one packed-FP logical op, in both AVX three-operand form
// (128- and 256-bit) and the legacy destructive SSE form. The AVX defs
// pass empty pattern lists ([], []) — see the comment above: isel prefers
// the integer VPAND/VPOR/... versions on SSE2+ targets.
2266 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2267 SDNode OpNode, X86SchedWriteWidths sched> {
2268 let Predicates = [HasAVX, NoVLX] in {
2269 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2270 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2271 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2273 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2274 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2275 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2277 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2278 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2279 [], [], 0>, PS, VEX_4V, VEX_WIG;
2281 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2282 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2283 [], [], 0>, PD, VEX_4V, VEX_WIG;
// Legacy SSE forms are destructive: $src1 tied to $dst.
// NOTE(review): the trailing argument lists and closing braces of these two
// defms are not visible in this excerpt — confirm against the full file.
2286 let Constraints = "$src1 = $dst" in {
2287 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2288 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2291 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2292 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
// Packed-FP logical instructions (ANDPS/PD, ORPS/PD, XORPS/PD, ANDNPS/PD).
// ANDN is marked non-commutable because X86andnp complements its first
// operand.
2297 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2298 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2299 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2300 let isCommutable = 0 in
2301 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// AVX2 selection patterns: map and/or/xor/andnp over the narrower 256-bit
// integer element types (v32i8, v16i16, v8i32) onto the v4i64-typed
// VPAND/VPOR/VPXOR/VPANDN instructions defined above, for both
// register-register and register-memory (folded load) forms.
2303 let Predicates = [HasAVX2, NoVLX] in {
2304 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2305 (VPANDYrr VR256:$src1, VR256:$src2)>;
2306 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2307 (VPANDYrr VR256:$src1, VR256:$src2)>;
2308 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2309 (VPANDYrr VR256:$src1, VR256:$src2)>;
2311 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2312 (VPORYrr VR256:$src1, VR256:$src2)>;
2313 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2314 (VPORYrr VR256:$src1, VR256:$src2)>;
2315 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2316 (VPORYrr VR256:$src1, VR256:$src2)>;
2318 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2319 (VPXORYrr VR256:$src1, VR256:$src2)>;
2320 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2321 (VPXORYrr VR256:$src1, VR256:$src2)>;
2322 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2323 (VPXORYrr VR256:$src1, VR256:$src2)>;
2325 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2326 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2327 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2328 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2329 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2330 (VPANDNYrr VR256:$src1, VR256:$src2)>;
// Folded-load (rm) forms of the same mappings.
2332 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2333 (VPANDYrm VR256:$src1, addr:$src2)>;
2334 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2335 (VPANDYrm VR256:$src1, addr:$src2)>;
2336 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2337 (VPANDYrm VR256:$src1, addr:$src2)>;
2339 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2340 (VPORYrm VR256:$src1, addr:$src2)>;
2341 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2342 (VPORYrm VR256:$src1, addr:$src2)>;
2343 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2344 (VPORYrm VR256:$src1, addr:$src2)>;
2346 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2347 (VPXORYrm VR256:$src1, addr:$src2)>;
2348 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2349 (VPXORYrm VR256:$src1, addr:$src2)>;
2350 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2351 (VPXORYrm VR256:$src1, addr:$src2)>;
2353 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2354 (VPANDNYrm VR256:$src1, addr:$src2)>;
2355 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2356 (VPANDNYrm VR256:$src1, addr:$src2)>;
2357 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2358 (VPANDNYrm VR256:$src1, addr:$src2)>;
2361 // If only AVX1 is supported, we need to handle integer operations with
2362 // floating point instructions since the integer versions aren't available.
// AVX1 has no 256-bit integer logic ops, so all 256-bit integer element
// types (including v4i64, unlike the AVX2 block above) are lowered to the
// single-precision FP logic instructions VANDPSY/VORPSY/VXORPSY/VANDNPSY.
2363 let Predicates = [HasAVX1Only] in {
2364 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2365 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2366 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2367 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2368 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2369 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2370 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2371 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2373 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2374 (VORPSYrr VR256:$src1, VR256:$src2)>;
2375 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2376 (VORPSYrr VR256:$src1, VR256:$src2)>;
2377 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2378 (VORPSYrr VR256:$src1, VR256:$src2)>;
2379 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2380 (VORPSYrr VR256:$src1, VR256:$src2)>;
2382 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2383 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2384 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2385 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2386 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2387 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2388 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2389 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2391 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2392 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2393 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2394 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2395 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2396 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2397 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2398 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
// Folded-load (rm) forms of the same mappings.
2400 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2401 (VANDPSYrm VR256:$src1, addr:$src2)>;
2402 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2403 (VANDPSYrm VR256:$src1, addr:$src2)>;
2404 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2405 (VANDPSYrm VR256:$src1, addr:$src2)>;
2406 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2407 (VANDPSYrm VR256:$src1, addr:$src2)>;
2409 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2410 (VORPSYrm VR256:$src1, addr:$src2)>;
2411 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2412 (VORPSYrm VR256:$src1, addr:$src2)>;
2413 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2414 (VORPSYrm VR256:$src1, addr:$src2)>;
2415 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2416 (VORPSYrm VR256:$src1, addr:$src2)>;
2418 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2419 (VXORPSYrm VR256:$src1, addr:$src2)>;
2420 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2421 (VXORPSYrm VR256:$src1, addr:$src2)>;
2422 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2423 (VXORPSYrm VR256:$src1, addr:$src2)>;
2424 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2425 (VXORPSYrm VR256:$src1, addr:$src2)>;
2427 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2428 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2429 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2430 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2431 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2432 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2433 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2434 (VANDNPSYrm VR256:$src1, addr:$src2)>;
// AVX 128-bit selection patterns: map and/or/xor/andnp over the narrower
// 128-bit integer element types (v16i8, v8i16, v4i32) onto the v2i64-typed
// VPAND/VPOR/VPXOR/VPANDN instructions, rr and folded-load rm forms.
2437 let Predicates = [HasAVX, NoVLX] in {
2438 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2439 (VPANDrr VR128:$src1, VR128:$src2)>;
2440 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2441 (VPANDrr VR128:$src1, VR128:$src2)>;
2442 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2443 (VPANDrr VR128:$src1, VR128:$src2)>;
2445 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2446 (VPORrr VR128:$src1, VR128:$src2)>;
2447 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2448 (VPORrr VR128:$src1, VR128:$src2)>;
2449 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2450 (VPORrr VR128:$src1, VR128:$src2)>;
2452 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2453 (VPXORrr VR128:$src1, VR128:$src2)>;
2454 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2455 (VPXORrr VR128:$src1, VR128:$src2)>;
2456 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2457 (VPXORrr VR128:$src1, VR128:$src2)>;
2459 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2460 (VPANDNrr VR128:$src1, VR128:$src2)>;
2461 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2462 (VPANDNrr VR128:$src1, VR128:$src2)>;
2463 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2464 (VPANDNrr VR128:$src1, VR128:$src2)>;
// Folded-load (rm) forms; AVX allows unaligned loads (loadvXiY).
2466 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2467 (VPANDrm VR128:$src1, addr:$src2)>;
2468 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2469 (VPANDrm VR128:$src1, addr:$src2)>;
2470 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2471 (VPANDrm VR128:$src1, addr:$src2)>;
2473 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2474 (VPORrm VR128:$src1, addr:$src2)>;
2475 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2476 (VPORrm VR128:$src1, addr:$src2)>;
2477 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2478 (VPORrm VR128:$src1, addr:$src2)>;
2480 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2481 (VPXORrm VR128:$src1, addr:$src2)>;
2482 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2483 (VPXORrm VR128:$src1, addr:$src2)>;
2484 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2485 (VPXORrm VR128:$src1, addr:$src2)>;
2487 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2488 (VPANDNrm VR128:$src1, addr:$src2)>;
2489 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2490 (VPANDNrm VR128:$src1, addr:$src2)>;
2491 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2492 (VPANDNrm VR128:$src1, addr:$src2)>;
// Legacy SSE2 selection patterns: same element-type mappings as the AVX
// block above, but targeting the destructive PAND/POR/PXOR/PANDN
// instructions. Memory forms require aligned loads (memopvXiY) rather
// than the AVX loadvXiY fragments.
2495 let Predicates = [UseSSE2] in {
2496 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2497 (PANDrr VR128:$src1, VR128:$src2)>;
2498 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2499 (PANDrr VR128:$src1, VR128:$src2)>;
2500 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2501 (PANDrr VR128:$src1, VR128:$src2)>;
2503 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2504 (PORrr VR128:$src1, VR128:$src2)>;
2505 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2506 (PORrr VR128:$src1, VR128:$src2)>;
2507 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2508 (PORrr VR128:$src1, VR128:$src2)>;
2510 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2511 (PXORrr VR128:$src1, VR128:$src2)>;
2512 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2513 (PXORrr VR128:$src1, VR128:$src2)>;
2514 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2515 (PXORrr VR128:$src1, VR128:$src2)>;
2517 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2518 (PANDNrr VR128:$src1, VR128:$src2)>;
2519 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2520 (PANDNrr VR128:$src1, VR128:$src2)>;
2521 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2522 (PANDNrr VR128:$src1, VR128:$src2)>;
// Folded-load forms — note the aligned memop fragments.
2524 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2525 (PANDrm VR128:$src1, addr:$src2)>;
2526 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2527 (PANDrm VR128:$src1, addr:$src2)>;
2528 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2529 (PANDrm VR128:$src1, addr:$src2)>;
2531 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2532 (PORrm VR128:$src1, addr:$src2)>;
2533 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2534 (PORrm VR128:$src1, addr:$src2)>;
2535 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2536 (PORrm VR128:$src1, addr:$src2)>;
2538 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2539 (PXORrm VR128:$src1, addr:$src2)>;
2540 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2541 (PXORrm VR128:$src1, addr:$src2)>;
2542 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2543 (PXORrm VR128:$src1, addr:$src2)>;
2545 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2546 (PANDNrm VR128:$src1, addr:$src2)>;
2547 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2548 (PANDNrm VR128:$src1, addr:$src2)>;
2549 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2550 (PANDNrm VR128:$src1, addr:$src2)>;
2553 // Patterns for packed operations when we don't have integer type available.
// SSE1-only v4f32 logical patterns, selecting the legacy ANDPS/ORPS/
// XORPS/ANDNPS instructions (rr and aligned-load rm forms).
// NOTE(review): the `let Predicates = [...] in {` line enclosing this group
// is not visible in this excerpt — confirm the guarding predicate in the
// full file.
2554 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2555 (ANDPSrr VR128:$src1, VR128:$src2)>;
2556 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2557 (ORPSrr VR128:$src1, VR128:$src2)>;
2558 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2559 (XORPSrr VR128:$src1, VR128:$src2)>;
2560 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2561 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2563 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2564 (ANDPSrm VR128:$src1, addr:$src2)>;
2565 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2566 (ORPSrm VR128:$src1, addr:$src2)>;
2567 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2568 (XORPSrm VR128:$src1, addr:$src2)>;
2569 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2570 (ANDNPSrm VR128:$src1, addr:$src2)>;
2572 //===----------------------------------------------------------------------===//
2573 // SSE 1 & 2 - Arithmetic Instructions
2574 //===----------------------------------------------------------------------===//
2576 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2579 /// In addition, we also have a special variant of the scalar form here to
2580 /// represent the associated intrinsic operation. This form is unlike the
2581 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2582 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2584 /// These three forms can each be reg+reg or reg+mem.
2587 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
// basic_sse12_fp_binop_p - Packed FP binop in all widths/encodings:
// AVX 128-bit PS/PD, AVX 256-bit PS/PD, and legacy destructive SSE PS/PD.
// All forms read MXCSR and may raise FP exceptions.
2589 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2590 SDNode OpNode, X86SchedWriteSizes sched> {
2591 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2592 let Predicates = [HasAVX, NoVLX] in {
// AVX 128-bit: three-operand, unaligned loads (loadv4f32/loadv2f64).
2593 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2594 VR128, v4f32, f128mem, loadv4f32,
2595 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2596 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2597 VR128, v2f64, f128mem, loadv2f64,
2598 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
// AVX 256-bit (VEX.L) variants.
2600 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2601 OpNode, VR256, v8f32, f256mem, loadv8f32,
2602 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2603 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2604 OpNode, VR256, v4f64, f256mem, loadv4f64,
2605 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE: destructive two-address form with aligned memory operands.
// NOTE(review): trailing arguments/closing braces of these defms are not
// visible in this excerpt.
2608 let Constraints = "$src1 = $dst" in {
2609 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2610 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2612 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2613 v2f64, f128mem, memopv2f64, SSEPackedDouble,
// basic_sse12_fp_binop_s - Scalar FP binop on FR32/FR64: AVX three-operand
// SS/SD forms plus the legacy destructive SSE forms. Reads MXCSR and may
// raise FP exceptions.
2619 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2620 X86SchedWriteSizes sched> {
2621 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2622 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2623 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2624 XS, VEX_4V, VEX_LIG, VEX_WIG;
2625 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2626 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2627 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy destructive forms ($src1 tied to $dst).
// NOTE(review): trailing arguments/closing braces of these defms are not
// visible in this excerpt.
2629 let Constraints = "$src1 = $dst" in {
2630 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2631 OpNode, FR32, f32mem, SSEPackedSingle,
2633 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2634 OpNode, FR64, f64mem, SSEPackedDouble,
// basic_sse12_fp_binop_s_int - Intrinsic (whole-VR128) variant of the
// scalar binop: operates on v4f32/v2f64 vectors and uses the sse_load_f32/
// sse_load_f64 scalar-load fragments. AVX three-operand and legacy
// destructive forms. Reads MXCSR and may raise FP exceptions.
2640 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2641 SDPatternOperator OpNode,
2642 X86SchedWriteSizes sched> {
2643 let Uses = [MXCSR], mayRaiseFPException = 1 in {
2644 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2645 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2646 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2647 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2648 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2649 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy destructive forms ($src1 tied to $dst).
2651 let Constraints = "$src1 = $dst" in {
2652 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2653 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2654 SSEPackedSingle, sched.PS.Scl>, XS;
2655 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2656 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2657 SSEPackedDouble, sched.PD.Scl>, XD;
2662 // Binary Arithmetic instructions
// ADD/MUL are commutable; SUB/DIV/MAX/MIN are defined under
// isCommutable = 0 (FP max/min are not commutable w.r.t. NaN/signed-zero
// operand order). The *_s_int instantiations use null_frag for ADD/MUL/
// SUB/DIV (no generic DAG node maps to the intrinsic form) but real
// X86fmaxs/X86fmins nodes for MAX/MIN.
2663 defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2664 basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2665 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2666 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2667 basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2668 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2669 let isCommutable = 0 in {
2670 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2671 basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2672 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2673 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2674 basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2675 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2676 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2677 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2678 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2679 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2680 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2681 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
// Codegen-only "commutative" max/min variants (X86fmaxc/X86fminc) sharing
// the same opcodes; isCodeGenOnly = 1 keeps them out of the assembler
// tables. Presumably selected when operand order is known not to matter —
// TODO(review): confirm against the X86fmaxc/X86fminc node definitions.
2684 let isCodeGenOnly = 1 in {
2685 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2686 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2687 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2688 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2691 // Patterns used to select SSE scalar fp arithmetic instructions from
2694 // (1) a scalar fp operation followed by a blend
2696 // The effect is that the backend no longer emits unnecessary vector
2697 // insert instructions immediately after SSE scalar fp instructions
2698 // like addss or mulss.
2700 // For example, given the following code:
2701 // __m128 foo(__m128 A, __m128 B) {
2706 // Previously we generated:
2707 // addss %xmm0, %xmm1
2708 // movss %xmm1, %xmm0
2711 // addss %xmm1, %xmm0
2713 // (2) a vector packed single/double fp operation followed by a vector insert
2715 // The effect is that the backend converts the packed fp instruction
2716 // followed by a vector insert into a single SSE scalar fp instruction.
2718 // For example, given the following code:
2719 // __m128 foo(__m128 A, __m128 B) {
2720 // __m128 C = A + B;
2721 // return (__m128) {c[0], a[1], a[2], a[3]};
2724 // Previously we generated:
2725 // addps %xmm0, %xmm1
2726 // movss %xmm1, %xmm0
2729 // addss %xmm1, %xmm0
2731 // TODO: Some canonicalization in lowering would simplify the number of
2732 // patterns we have to try to match.
// scalar_math_patterns - Select the _Int (whole-vector) scalar-math
// instruction for "extract element 0, do scalar Op, reinsert via
// movss/movsd" DAGs, in both register and folded-load forms, for the
// legacy (BasePredicate) and AVX (UseAVX, "V"-prefixed) instructions.
// NOTE(review): the second source operand line of the rr patterns is
// missing from this excerpt (the numbering skips e.g. 2742/2758) —
// confirm against the full file.
2733 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2734 ValueType VT, ValueType EltTy,
2735 RegisterClass RC, PatFrag ld_frag,
2736 Predicate BasePredicate> {
2737 let Predicates = [BasePredicate] in {
2738 // extracted scalar math op with insert via movss/movsd
2739 def : Pat<(VT (Move (VT VR128:$dst),
2740 (VT (scalar_to_vector
2741 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2743 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2744 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
// Same, with the scalar RHS folded from memory.
2745 def : Pat<(VT (Move (VT VR128:$dst),
2746 (VT (scalar_to_vector
2747 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2748 (ld_frag addr:$src)))))),
2749 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2752 // Repeat for AVX versions of the instructions.
2753 let Predicates = [UseAVX] in {
2754 // extracted scalar math op with insert via movss/movsd
2755 def : Pat<(VT (Move (VT VR128:$dst),
2756 (VT (scalar_to_vector
2757 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2759 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2760 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2761 def : Pat<(VT (Move (VT VR128:$dst),
2762 (VT (scalar_to_vector
2763 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2764 (ld_frag addr:$src)))))),
2765 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
// Instantiate the scalar-math selection patterns for the four basic ops,
// in both single precision (movss/f32, SSE1) and double precision
// (movsd/f64, SSE2).
2769 defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2770 defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2771 defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2772 defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2774 defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2775 defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2776 defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2777 defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2780 /// In addition, we also have a special variant of the scalar form here to
2781 /// represent the associated intrinsic operation. This form is unlike the
2782 /// plain scalar form, in that it takes an entire vector (instead of a
2783 /// scalar) and leaves the top elements undefined.
2785 /// And, we have a special variant form for a full-vector intrinsic form.
2787 /// sse_fp_unop_s - SSE1 unops in scalar form
2788 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2789 /// the HW instructions are 2 operand / destructive.
2790 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2791 ValueType ScalarVT, X86MemOperand x86memop,
2792 Operand intmemop, SDNode OpNode, Domain d,
2793 X86FoldableSchedWrite sched, Predicate target> {
// Non-intrinsic forms on the scalar register class (FR32/FR64); kept
// isCodeGenOnly since the asm string matches the _Int forms.
2794 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2795 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2796 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2797 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
// Memory form only selected under OptForSize (see the partial-register
// update comments elsewhere in this section).
2800 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2801 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2802 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2803 Sched<[sched.Folded]>,
2804 Requires<[target, OptForSize]>;
// Intrinsic (_Int) forms on VR128, pattern-less; they are selected by the
// sse_fp_unop_s_intr patterns below. Two-address: $src1 tied to $dst
// because the upper elements of the destination pass through.
2807 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2808 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2809 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2812 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2813 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2814 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sse_fp_unop_s_intr - Selection patterns mapping a scalar-unary intrinsic
// onto the legacy _Int instruction forms defined by sse_fp_unop_s.
2819 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2820 PatFrags mem_frags, Intrinsic Intr,
2821 Predicate target, string Suffix> {
2822 let Predicates = [target] in {
2823 // These are unary operations, but they are modeled as having 2 source operands
2824 // because the high elements of the destination are unchanged in SSE.
// The same register is passed as both sources.
2825 def : Pat<(Intr VR128:$src),
2826 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2828 // We don't want to fold scalar loads into these instructions unless
2829 // optimizing for size. This is because the folded instruction will have a
2830 // partial register update, while the unfolded sequence will not, e.g.
2832 // rcpss %xmm0, %xmm0
2833 // which has a clobber before the rcp, vs.
// Memory form: the tied first source is fed an IMPLICIT_DEF since only
// element 0 of the result is consumed by the intrinsic.
2835 let Predicates = [target, OptForSize] in {
2836 def : Pat<(Intr (mem_frags addr:$src2)),
2837 (!cast<Instruction>(NAME#m_Int)
2838 (vt (IMPLICIT_DEF)), addr:$src2)>;
// avx_fp_unop_s_intr - AVX counterpart of sse_fp_unop_s_intr: selection
// patterns mapping a scalar-unary intrinsic onto the VEX _Int forms.
// NOTE(review): the second operand line of the register pattern is missing
// from this excerpt (numbering skips 2847/2848) — confirm in the full file.
2842 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
2843 Intrinsic Intr, Predicate target> {
2844 let Predicates = [target] in {
2845 def : Pat<(Intr VR128:$src),
2846 (!cast<Instruction>(NAME#r_Int) VR128:$src,
// Memory form only under OptForSize (partial-register-update avoidance,
// as documented for the SSE variant of this multiclass).
2849 let Predicates = [target, OptForSize] in {
2850 def : Pat<(Intr (mem_frags addr:$src2)),
2851 (!cast<Instruction>(NAME#m_Int)
2852 (vt (IMPLICIT_DEF)), addr:$src2)>;
// avx_fp_unop_s - AVX scalar unop: three-operand VEX encodings ($src1
// supplies the pass-through upper elements). Defines pattern-less r/m
// (scalar register class, isCodeGenOnly) and r_Int/m_Int (VR128) forms,
// plus selection patterns that feed IMPLICIT_DEF as the pass-through.
2856 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2857 ValueType ScalarVT, X86MemOperand x86memop,
2858 Operand intmemop, SDNode OpNode, Domain d,
2859 X86FoldableSchedWrite sched, Predicate target> {
2860 let isCodeGenOnly = 1, hasSideEffects = 0 in {
2861 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2862 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2863 [], d>, Sched<[sched]>;
2865 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2866 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2867 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Intrinsic (_Int) forms on VR128, also pattern-less.
2869 let hasSideEffects = 0, ExeDomain = d in {
2870 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2871 (ins VR128:$src1, VR128:$src2),
2872 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2873 []>, Sched<[sched]>;
2875 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2876 (ins VR128:$src1, intmemop:$src2),
2877 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2878 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2881 // We don't want to fold scalar loads into these instructions unless
2882 // optimizing for size. This is because the folded instruction will have a
2883 // partial register update, while the unfolded sequence will not, e.g.
2884 // vmovss mem, %xmm0
2885 // vrcpss %xmm0, %xmm0, %xmm0
2886 // which has a clobber before the rcp, vs.
2887 // vrcpss mem, %xmm0, %xmm0
2888 // TODO: In theory, we could fold the load, and avoid the stall caused by
2889 // the partial register store, either in BreakFalseDeps or with smarter RA.
// Select the register form for a plain scalar OpNode; the pass-through
// source is IMPLICIT_DEF because only the low element is used.
2890 let Predicates = [target] in {
2891 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
2892 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
// Load-folded form, gated on OptForSize per the comment above.
2894 let Predicates = [target, OptForSize] in {
2895 def : Pat<(ScalarVT (OpNode (load addr:$src))),
2896 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2901 /// sse1_fp_unop_p - SSE1 unops in packed form.
// Defines the VEX-encoded 128-bit (V..PSr/V..PSm) and 256-bit
// (V..PSYr/V..PSYm) forms under the caller-supplied predicate list, plus
// the legacy SSE1 forms (PSr/PSm, aligned memopv4f32 loads).
2902 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
2903 X86SchedWriteWidths sched, list<Predicate> prds> {
2904 let Predicates = prds in {
2905 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2906 !strconcat("v", OpcodeStr,
2907 "ps\t{$src, $dst|$dst, $src}"),
2908 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2909 VEX, Sched<[sched.XMM]>, VEX_WIG;
2910 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2911 !strconcat("v", OpcodeStr,
2912 "ps\t{$src, $dst|$dst, $src}"),
2913 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2914 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2915 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2916 !strconcat("v", OpcodeStr,
2917 "ps\t{$src, $dst|$dst, $src}"),
2918 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2919 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2920 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2921 !strconcat("v", OpcodeStr,
2922 "ps\t{$src, $dst|$dst, $src}"),
2923 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2924 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE1 forms; memory operand must be aligned (memopv4f32).
2927 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2928 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2929 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2931 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2932 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2933 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2934 Sched<[sched.XMM.Folded]>;
2937 /// sse2_fp_unop_p - SSE2 unops in vector forms.
// Double-precision counterpart of sse1_fp_unop_p: VEX 128/256-bit PD
// forms under [HasAVX, NoVLX], plus legacy SSE2 forms with aligned
// (memopv2f64) memory operands.
2938 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2939 SDNode OpNode, X86SchedWriteWidths sched> {
2940 let Predicates = [HasAVX, NoVLX] in {
2941 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2942 !strconcat("v", OpcodeStr,
2943 "pd\t{$src, $dst|$dst, $src}"),
2944 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2945 VEX, Sched<[sched.XMM]>, VEX_WIG;
2946 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2947 !strconcat("v", OpcodeStr,
2948 "pd\t{$src, $dst|$dst, $src}"),
2949 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2950 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2951 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2952 !strconcat("v", OpcodeStr,
2953 "pd\t{$src, $dst|$dst, $src}"),
2954 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2955 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2956 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2957 !strconcat("v", OpcodeStr,
2958 "pd\t{$src, $dst|$dst, $src}"),
2959 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2960 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 forms.
2963 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2964 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2965 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2967 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2968 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2969 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2970 Sched<[sched.XMM.Folded]>;
// sse1_fp_unop_s_intr - Wire the int_x86_sse_*_ss intrinsic to both the
// legacy SS and the VEX-encoded V..SS _Int instruction forms.
2973 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
2974 X86SchedWriteWidths sched, Predicate AVXTarget> {
2975 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2976 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2978 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2979 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2981 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
// sse1_fp_unop_s - Single-precision scalar unop: legacy SS form (UseSSE1)
// plus the VEX-encoded V..SS form gated on AVXTarget.
2984 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2985 X86SchedWriteWidths sched, Predicate AVXTarget> {
2986 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
2987 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
2988 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
2989 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
2990 XS, VEX_4V, VEX_LIG, VEX_WIG;
// sse2_fp_unop_s - Double-precision counterpart of sse1_fp_unop_s:
// legacy SD form (UseSSE2) plus the VEX-encoded V..SD form.
2993 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2994 X86SchedWriteWidths sched, Predicate AVXTarget> {
2995 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
2996 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
2997 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
2998 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2999 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Square root: exact (IEEE) operation, scalar + packed, f32 + f64.
// SIMD_EXC: may raise FP exceptions.
3003 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
3004 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3005 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3006 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3008 // Reciprocal approximations. Note that these typically require refinement
3009 // in order to obtain suitable precision.
// RSQRT/RCP exist only in single precision (see comment below).
3010 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3011 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3012 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3013 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3014 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3015 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3017 // There is no f64 version of the reciprocal approximation instructions.
3019 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3020 ValueType VT, Predicate BasePredicate> {
3021 let Predicates = [BasePredicate] in {
3022 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3023 (OpNode (extractelt VT:$src, 0))))),
3024 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3027 // Repeat for AVX versions of the instructions.
3028 let Predicates = [UseAVX] in {
3029 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3030 (OpNode (extractelt VT:$src, 0))))),
3031 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3035 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3036 defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3038 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3039 SDNode Move, ValueType VT,
3040 Predicate BasePredicate> {
3041 let Predicates = [BasePredicate] in {
3042 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3043 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3046 // Repeat for AVX versions of the instructions.
3047 let Predicates = [HasAVX] in {
3048 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3049 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3053 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3055 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3059 //===----------------------------------------------------------------------===//
3060 // SSE 1 & 2 - Non-temporal stores
3061 //===----------------------------------------------------------------------===//
// NOTE(review): embedded line numbers skip values here too — some closing
// braces from the full file are absent in this extraction.
// Non-temporal (streaming) stores bypass the cache hierarchy; all 128/256-bit
// forms below require the address to be aligned (alignednontemporalstore).
3063 let AddedComplexity = 400 in { // Prefer non-temporal versions
3064 let Predicates = [HasAVX, NoVLX] in {
3065 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3066 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3067 (ins f128mem:$dst, VR128:$src),
3068 "movntps\t{$src, $dst|$dst, $src}",
3069 [(alignednontemporalstore (v4f32 VR128:$src),
3070 addr:$dst)]>, VEX, VEX_WIG;
3071 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3072 (ins f128mem:$dst, VR128:$src),
3073 "movntpd\t{$src, $dst|$dst, $src}",
3074 [(alignednontemporalstore (v2f64 VR128:$src),
3075 addr:$dst)]>, VEX, VEX_WIG;
3078 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3079 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3080 (ins f256mem:$dst, VR256:$src),
3081 "movntps\t{$src, $dst|$dst, $src}",
3082 [(alignednontemporalstore (v8f32 VR256:$src),
3083 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3084 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3085 (ins f256mem:$dst, VR256:$src),
3086 "movntpd\t{$src, $dst|$dst, $src}",
3087 [(alignednontemporalstore (v4f64 VR256:$src),
3088 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
// Integer-domain streaming stores (movntdq).
3091 let ExeDomain = SSEPackedInt in {
3092 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3093 (ins i128mem:$dst, VR128:$src),
3094 "movntdq\t{$src, $dst|$dst, $src}",
3095 [(alignednontemporalstore (v2i64 VR128:$src),
3096 addr:$dst)]>, VEX, VEX_WIG,
3097 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3098 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3099 (ins i256mem:$dst, VR256:$src),
3100 "movntdq\t{$src, $dst|$dst, $src}",
3101 [(alignednontemporalstore (v4i64 VR256:$src),
3102 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3103 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
// Legacy (non-VEX) SSE encodings of the same stores.
3107 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3108 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3109 "movntps\t{$src, $dst|$dst, $src}",
3110 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3111 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3112 "movntpd\t{$src, $dst|$dst, $src}",
3113 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3116 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3117 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3118 "movntdq\t{$src, $dst|$dst, $src}",
3119 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3121 let SchedRW = [WriteStoreNT] in {
3122 // There is no AVX form for instructions below this point
// GPR streaming stores (movnti); SSE2-only, 32- and 64-bit operand sizes.
3123 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3124 "movnti{l}\t{$src, $dst|$dst, $src}",
3125 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3126 PS, Requires<[HasSSE2]>;
3127 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3128 "movnti{q}\t{$src, $dst|$dst, $src}",
3129 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3130 PS, Requires<[HasSSE2]>;
3131 } // SchedRW = [WriteStoreNT]
// Route other integer element widths through the v2i64/v4i64 instructions.
3133 let Predicates = [HasAVX, NoVLX] in {
3134 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3135 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3136 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3137 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3138 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3139 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3141 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3142 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3143 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3144 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3145 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3146 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3149 let Predicates = [UseSSE2] in {
3150 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3151 (MOVNTDQmr addr:$dst, VR128:$src)>;
3152 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3153 (MOVNTDQmr addr:$dst, VR128:$src)>;
3154 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3155 (MOVNTDQmr addr:$dst, VR128:$src)>;
3158 } // AddedComplexity
3160 //===----------------------------------------------------------------------===//
3161 // SSE 1 & 2 - Prefetch and memory fence
3162 //===----------------------------------------------------------------------===//
3164 // Prefetch intrinsic.
// Prefetch hints: the third prefetch operand is the locality level
// (3 = T0 closest, 0 = NTA non-temporal); these only hint, never fault.
3165 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3166 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3167 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3168 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3169 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3170 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3171 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3172 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3173 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3176 // FIXME: How should flush instruction be modeled?
3177 let SchedRW = [WriteLoad] in {
// Flush the cache line containing the byte address.
3179 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3180 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3181 PS, Requires<[HasSSE2]>;
3184 let SchedRW = [WriteNop] in {
3185 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3186 // was introduced with SSE2, it's backward compatible.
3187 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3188 "pause", [(int_x86_sse2_pause)]>, OBXS;
3191 let SchedRW = [WriteFence] in {
3192 // Load, store, and memory fence
3193 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3194 // to include any 64-bit target.
3195 def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3196 PS, Requires<[HasSSE1]>;
3197 def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3198 PS, Requires<[HasSSE2]>;
3199 def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3200 PS, Requires<[HasMFence]>;
// Lower the generic X86MFence node to the MFENCE instruction.
3203 def : Pat<(X86MFence), (MFENCE)>;
3205 //===----------------------------------------------------------------------===//
3206 // SSE 1 & 2 - Load/Store MXCSR register
3207 //===----------------------------------------------------------------------===//
// Load/store of the MXCSR control/status register; VEX-encoded AVX forms
// first, then the legacy SSE encodings. Marked hasSideEffects because MXCSR
// controls rounding/exception state for all FP ops.
3209 let mayLoad=1, hasSideEffects=1 in
3210 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3211 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3212 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3213 let mayStore=1, hasSideEffects=1 in
3214 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3215 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3216 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3218 let mayLoad=1, hasSideEffects=1 in
3219 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3220 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3221 PS, Sched<[WriteLDMXCSR]>;
3222 let mayStore=1, hasSideEffects=1 in
3223 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3224 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3225 PS, Sched<[WriteSTMXCSR]>;
3227 //===---------------------------------------------------------------------===//
3228 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3229 //===---------------------------------------------------------------------===//
// NOTE(review): embedded line numbers skip values — closing braces and some
// trailing encoding-flag lines from the full file are absent in this extract.
// MOVDQA (aligned) / MOVDQU (unaligned) packed-integer moves: reg-reg,
// disassembly-only _REV reg-reg forms (opposite ModRM direction), loads,
// and stores; VEX variants first, then legacy SSE.
3231 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3233 let hasSideEffects = 0 in {
3234 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3235 "movdqa\t{$src, $dst|$dst, $src}", []>,
3236 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3237 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3238 "movdqu\t{$src, $dst|$dst, $src}", []>,
3239 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3240 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3241 "movdqa\t{$src, $dst|$dst, $src}", []>,
3242 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3243 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3244 "movdqu\t{$src, $dst|$dst, $src}", []>,
3245 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
// _REV forms exist only so the disassembler can represent the 0x7F
// (store-direction) reg-reg encoding; FoldGenData maps back to the canonical def.
3249 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3250 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3251 "movdqa\t{$src, $dst|$dst, $src}", []>,
3252 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3253 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3254 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3255 "movdqa\t{$src, $dst|$dst, $src}", []>,
3256 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3257 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3258 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3259 "movdqu\t{$src, $dst|$dst, $src}", []>,
3260 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3261 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3262 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3263 "movdqu\t{$src, $dst|$dst, $src}", []>,
3264 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3265 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
// Loads: rematerializable and foldable; only the 128-bit forms carry patterns
// (v2i64); other element types are matched by the Pats further below.
3268 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3269 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3270 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3271 "movdqa\t{$src, $dst|$dst, $src}",
3272 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3273 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3274 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3275 "movdqa\t{$src, $dst|$dst, $src}", []>,
3276 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3277 VEX, VEX_L, VEX_WIG;
3278 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3279 "vmovdqu\t{$src, $dst|$dst, $src}",
3280 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3281 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3283 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3284 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3285 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3286 XS, VEX, VEX_L, VEX_WIG;
// Stores (opcode 0x7F, ModRM dest-mem).
3289 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3290 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3291 (ins i128mem:$dst, VR128:$src),
3292 "movdqa\t{$src, $dst|$dst, $src}",
3293 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3294 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3295 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3296 (ins i256mem:$dst, VR256:$src),
3297 "movdqa\t{$src, $dst|$dst, $src}", []>,
3298 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3299 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3300 "vmovdqu\t{$src, $dst|$dst, $src}",
3301 [(store (v2i64 VR128:$src), addr:$dst)]>,
3302 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3303 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3304 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3305 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
// Legacy SSE2 encodings.
3308 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3309 let hasSideEffects = 0 in {
3310 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3311 "movdqa\t{$src, $dst|$dst, $src}", []>;
3313 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3314 "movdqu\t{$src, $dst|$dst, $src}", []>,
3315 XS, Requires<[UseSSE2]>;
3319 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3320 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3321 "movdqa\t{$src, $dst|$dst, $src}", []>,
3322 FoldGenData<"MOVDQArr">;
3324 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3325 "movdqu\t{$src, $dst|$dst, $src}", []>,
3326 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
// Patterns intentionally commented out: selection goes through the Pats below.
3330 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3331 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3332 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3333 "movdqa\t{$src, $dst|$dst, $src}",
3334 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3335 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3336 "movdqu\t{$src, $dst|$dst, $src}",
3337 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3338 XS, Requires<[UseSSE2]>;
3341 let mayStore = 1, hasSideEffects = 0,
3342 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3343 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3344 "movdqa\t{$src, $dst|$dst, $src}",
3345 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3346 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3347 "movdqu\t{$src, $dst|$dst, $src}",
3348 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3349 XS, Requires<[UseSSE2]>;
3352 } // ExeDomain = SSEPackedInt
3354 // Reversed version with ".s" suffix for GAS compatibility.
3355 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3356 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3357 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3358 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3359 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3360 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3361 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3362 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3364 // Reversed version with ".s" suffix for GAS compatibility.
3365 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3366 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3367 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3368 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
// Route the i32/i16/i8 element types through the v2i64-typed instructions.
3370 let Predicates = [HasAVX, NoVLX] in {
3371 // Additional patterns for other integer sizes.
3372 def : Pat<(alignedloadv4i32 addr:$src),
3373 (VMOVDQArm addr:$src)>;
3374 def : Pat<(alignedloadv8i16 addr:$src),
3375 (VMOVDQArm addr:$src)>;
3376 def : Pat<(alignedloadv16i8 addr:$src),
3377 (VMOVDQArm addr:$src)>;
3378 def : Pat<(loadv4i32 addr:$src),
3379 (VMOVDQUrm addr:$src)>;
3380 def : Pat<(loadv8i16 addr:$src),
3381 (VMOVDQUrm addr:$src)>;
3382 def : Pat<(loadv16i8 addr:$src),
3383 (VMOVDQUrm addr:$src)>;
3385 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3386 (VMOVDQAmr addr:$dst, VR128:$src)>;
3387 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3388 (VMOVDQAmr addr:$dst, VR128:$src)>;
3389 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3390 (VMOVDQAmr addr:$dst, VR128:$src)>;
3391 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3392 (VMOVDQUmr addr:$dst, VR128:$src)>;
3393 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3394 (VMOVDQUmr addr:$dst, VR128:$src)>;
3395 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3396 (VMOVDQUmr addr:$dst, VR128:$src)>;
3399 //===---------------------------------------------------------------------===//
3400 // SSE2 - Packed Integer Arithmetic Instructions
3401 //===---------------------------------------------------------------------===//
3403 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3405 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
// Emits the reg-reg and reg-mem forms; with Is2Addr the asm string uses the
// two-operand (destructive) syntax, otherwise the three-operand VEX syntax.
3406 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3407 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3408 PatFrag memop_frag, X86MemOperand x86memop,
3409 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3410 let isCommutable = 1 in
3411 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3412 (ins RC:$src1, RC:$src2),
3414 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3415 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3416 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3418 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3419 (ins RC:$src1, x86memop:$src2),
3421 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3422 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3423 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3424 (memop_frag addr:$src2))))]>,
3425 Sched<[sched.Folded, sched.ReadAfterFold]>;
3427 } // ExeDomain = SSEPackedInt
// Packed integer add/sub/mul/min/max/avg across all element widths; the bit
// argument marks commutable ops, the predicate picks SSE vs AVX-512 overlap.
3429 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3430 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3431 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3432 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3433 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3434 SchedWriteVecALU, 1, NoVLX>;
3435 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3436 SchedWriteVecALU, 1, NoVLX>;
3437 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3438 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3439 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3440 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3441 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3442 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3443 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3444 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3445 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3446 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3447 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3448 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3449 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3450 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3451 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3452 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3453 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3454 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3455 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3456 SchedWriteVecALU, 0, NoVLX>;
3457 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3458 SchedWriteVecALU, 0, NoVLX>;
3459 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3460 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3461 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3462 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3463 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3464 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3465 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3466 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3467 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3468 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3469 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3470 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3471 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3472 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3473 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3474 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3475 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3476 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3477 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3478 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3479 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3480 SchedWriteVecIMul, 1, NoVLX>;
// pmaddwd and psadbw change element type (Src != Dst), so they use
// PDI_binop_rm2 directly rather than PDI_binop_all.
3482 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3483 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3484 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3487 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3488 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3489 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3490 0>, VEX_4V, VEX_L, VEX_WIG;
3491 let Constraints = "$src1 = $dst" in
3492 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3493 memop, i128mem, SchedWriteVecIMul.XMM>;
3495 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3496 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3497 load, i128mem, SchedWritePSADBW.XMM, 0>,
3499 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3500 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3501 load, i256mem, SchedWritePSADBW.YMM, 0>,
3502 VEX_4V, VEX_L, VEX_WIG;
3503 let Constraints = "$src1 = $dst" in
3504 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3505 memop, i128mem, SchedWritePSADBW.XMM>;
3507 //===---------------------------------------------------------------------===//
3508 // SSE2 - Packed Integer Logical Instructions
3509 //===---------------------------------------------------------------------===//
// Shift-by-vector (rr/rm, via OpNode) and shift-by-immediate (ri, via
// OpNode2) forms for one register class / type combination.
3511 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3512 string OpcodeStr, SDNode OpNode,
3513 SDNode OpNode2, RegisterClass RC,
3514 X86FoldableSchedWrite sched,
3515 X86FoldableSchedWrite schedImm,
3516 ValueType DstVT, ValueType SrcVT,
3517 PatFrag ld_frag, bit Is2Addr = 1> {
3518 // src2 is always 128-bit
3519 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3520 (ins RC:$src1, VR128:$src2),
3522 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3523 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3524 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3526 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3527 (ins RC:$src1, i128mem:$src2),
3529 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3530 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3531 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3532 (SrcVT (ld_frag addr:$src2)))))]>,
3533 Sched<[sched.Folded, sched.ReadAfterFold]>;
3534 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3535 (ins RC:$src1, u8imm:$src2),
3537 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3538 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3539 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
// Instantiates PDI_binop_rmi three times: VEX 128-bit, VEX 256-bit, and the
// legacy two-address SSE form.
3543 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3544 string OpcodeStr, SDNode OpNode,
3545 SDNode OpNode2, ValueType DstVT128,
3546 ValueType DstVT256, ValueType SrcVT,
3547 X86SchedWriteWidths sched,
3548 X86SchedWriteWidths schedImm, Predicate prd> {
3549 let Predicates = [HasAVX, prd] in
3550 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3551 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3552 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3553 let Predicates = [HasAVX2, prd] in
3554 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3555 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3556 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3558 let Constraints = "$src1 = $dst" in
3559 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3560 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
// Immediate-only byte-shift form (used by pslldq/psrldq, which have no
// shift-by-vector variant).
3564 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3565 SDNode OpNode, RegisterClass RC, ValueType VT,
3566 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3567 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3569 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3570 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3571 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3575 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3576 SDNode OpNode, X86SchedWriteWidths sched> {
3577 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3578 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3579 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3580 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3581 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3582 VR256, v32i8, sched.YMM, 0>,
3583 VEX_4V, VEX_L, VEX_WIG;
3584 let Constraints = "$src1 = $dst" in
3585 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
// Packed shifts: logical left/right and arithmetic right (no psraq in SSE2).
3589 let ExeDomain = SSEPackedInt in {
3590 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3591 v8i16, v16i16, v8i16, SchedWriteVecShift,
3592 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3593 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3594 v4i32, v8i32, v4i32, SchedWriteVecShift,
3595 SchedWriteVecShiftImm, NoVLX>;
3596 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3597 v2i64, v4i64, v2i64, SchedWriteVecShift,
3598 SchedWriteVecShiftImm, NoVLX>;
3600 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3601 v8i16, v16i16, v8i16, SchedWriteVecShift,
3602 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3603 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3604 v4i32, v8i32, v4i32, SchedWriteVecShift,
3605 SchedWriteVecShiftImm, NoVLX>;
3606 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3607 v2i64, v4i64, v2i64, SchedWriteVecShift,
3608 SchedWriteVecShiftImm, NoVLX>;
3610 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3611 v8i16, v16i16, v8i16, SchedWriteVecShift,
3612 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3613 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3614 v4i32, v8i32, v4i32, SchedWriteVecShift,
3615 SchedWriteVecShiftImm, NoVLX>;
3617 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3619 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3621 } // ExeDomain = SSEPackedInt
3623 //===---------------------------------------------------------------------===//
3624 // SSE2 - Packed Integer Comparison Instructions
3625 //===---------------------------------------------------------------------===//
// Packed integer compares; pcmpeq* are commutable (bit = 1), pcmpgt* are not.
3627 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3628 SchedWriteVecALU, 1, TruePredicate>;
3629 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3630 SchedWriteVecALU, 1, TruePredicate>;
3631 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3632 SchedWriteVecALU, 1, TruePredicate>;
3633 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3634 SchedWriteVecALU, 0, TruePredicate>;
3635 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3636 SchedWriteVecALU, 0, TruePredicate>;
3637 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3638 SchedWriteVecALU, 0, TruePredicate>;
3640 //===---------------------------------------------------------------------===//
3641 // SSE2 - Packed Integer Shuffle Instructions
3642 //===---------------------------------------------------------------------===//
3644 let ExeDomain = SSEPackedInt in {
// Immediate-controlled shuffle (opcode 0x70): VEX 128-bit, VEX 256-bit, and
// legacy SSE2 forms, each with a register and a memory source variant.
3645 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3646 SDNode OpNode, X86SchedWriteWidths sched,
3648 let Predicates = [HasAVX, prd] in {
3649 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3650 (ins VR128:$src1, u8imm:$src2),
3651 !strconcat("v", OpcodeStr,
3652 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3654 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3655 VEX, Sched<[sched.XMM]>, VEX_WIG;
3656 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3657 (ins i128mem:$src1, u8imm:$src2),
3658 !strconcat("v", OpcodeStr,
3659 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3661 (vt128 (OpNode (load addr:$src1),
3662 (i8 timm:$src2))))]>, VEX,
3663 Sched<[sched.XMM.Folded]>, VEX_WIG;
3666 let Predicates = [HasAVX2, prd] in {
3667 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3668 (ins VR256:$src1, u8imm:$src2),
3669 !strconcat("v", OpcodeStr,
3670 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3672 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3673 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3674 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3675 (ins i256mem:$src1, u8imm:$src2),
3676 !strconcat("v", OpcodeStr,
3677 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3679 (vt256 (OpNode (load addr:$src1),
3680 (i8 timm:$src2))))]>, VEX, VEX_L,
3681 Sched<[sched.YMM.Folded]>, VEX_WIG;
3684 let Predicates = [UseSSE2] in {
3685 def ri : Ii8<0x70, MRMSrcReg,
3686 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3687 !strconcat(OpcodeStr,
3688 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3690 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3692 def mi : Ii8<0x70, MRMSrcMem,
3693 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3694 !strconcat(OpcodeStr,
3695 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3697 (vt128 (OpNode (memop addr:$src1),
3698 (i8 timm:$src2))))]>,
3699 Sched<[sched.XMM.Folded]>;
3702 } // ExeDomain = SSEPackedInt
// The prefix byte (PD/XS/XD) distinguishes pshufd / pshufhw / pshuflw, which
// share opcode 0x70.
3704 defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3705 SchedWriteShuffle, NoVLX>, PD;
3706 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3707 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3708 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3709 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3711 //===---------------------------------------------------------------------===//
3712 // Packed Integer Pack Instructions (SSE & AVX)
3713 //===---------------------------------------------------------------------===//
// Pack-with-saturation instruction classes.  Each multiclass emits a
// register-register form (rr) and a register-memory form (rm); OutVT is the
// narrower result element type, ArgVT the wider source element type, and
// OpNode the saturating-pack SDNode that selects the instruction.
// The two !strconcat asm strings are the 2-address SSE spelling and the
// 3-operand AVX spelling; the Is2Addr parameter chooses between them.
// NOTE(review): the `!if(Is2Addr, ...)` selector and `Sched<...>` lines for
// the rr forms appear to be elided in this excerpt (embedded line numbers
// jump) — confirm against the full file before editing.
3715 let ExeDomain = SSEPackedInt in {
// sse2_pack: PACKSSWB/PACKSSDW/PACKUSWB (SSE2 encodings, PDI format).
3716 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3717 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3718 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3719 PatFrag ld_frag, bit Is2Addr = 1> {
3720 def rr : PDI<opc, MRMSrcReg,
3721 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3723 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3724 !strconcat(OpcodeStr,
3725 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3727 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
// rm form: second operand folded from memory via ld_frag.
3729 def rm : PDI<opc, MRMSrcMem,
3730 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3732 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3733 !strconcat(OpcodeStr,
3734 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3736 (OutVT (OpNode (ArgVT RC:$src1),
3737 (ld_frag addr:$src2))))]>,
3738 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sse4_pack: PACKUSDW, introduced in SSE4.1, hence the SS48I (0F 38 map)
// format class instead of PDI.  Structure mirrors sse2_pack exactly.
3741 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3742 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3743 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3744 PatFrag ld_frag, bit Is2Addr = 1> {
3745 def rr : SS48I<opc, MRMSrcReg,
3746 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3748 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3749 !strconcat(OpcodeStr,
3750 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3752 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3754 def rm : SS48I<opc, MRMSrcMem,
3755 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3757 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3758 !strconcat(OpcodeStr,
3759 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3761 (OutVT (OpNode (ArgVT RC:$src1),
3762 (ld_frag addr:$src2))))]>,
3763 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations of the pack multiclasses, three flavours:
//   1. VEX-encoded 128-bit (HasAVX) — 3-operand, Is2Addr = 0, `load` frag
//      (unaligned loads are legal operands under AVX).
//   2. VEX-encoded 256-bit (HasAVX2) — VR256/i256mem, VEX_L set.
//   3. Legacy SSE — 2-address ($src1 tied to $dst), `memop` frag
//      (alignment-checked load, as required by legacy SSE memory operands).
3766 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3767 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3768 i128mem, SchedWriteShuffle.XMM, load, 0>,
3770 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3771 i128mem, SchedWriteShuffle.XMM, load, 0>,
3774 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3775 i128mem, SchedWriteShuffle.XMM, load, 0>,
3777 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3778 i128mem, SchedWriteShuffle.XMM, load, 0>,
// 256-bit AVX2 forms; note packs operate per 128-bit lane in hardware, which
// is encoded in the X86Packss/X86Packus node semantics, not here.
3782 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3783 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3784 i256mem, SchedWriteShuffle.YMM, load, 0>,
3785 VEX_4V, VEX_L, VEX_WIG;
3786 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3787 i256mem, SchedWriteShuffle.YMM, load, 0>,
3788 VEX_4V, VEX_L, VEX_WIG;
3790 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3791 i256mem, SchedWriteShuffle.YMM, load, 0>,
3792 VEX_4V, VEX_L, VEX_WIG;
3793 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3794 i256mem, SchedWriteShuffle.YMM, load, 0>,
// Legacy SSE forms: destructive 2-address encoding.
3798 let Constraints = "$src1 = $dst" in {
3799 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3800 i128mem, SchedWriteShuffle.XMM, memop>;
3801 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3802 i128mem, SchedWriteShuffle.XMM, memop>;
3804 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3805 i128mem, SchedWriteShuffle.XMM, memop>;
3807 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3808 i128mem, SchedWriteShuffle.XMM, memop>;
3810 } // ExeDomain = SSEPackedInt
3812 //===---------------------------------------------------------------------===//
3813 // SSE2 - Packed Integer Unpack Instructions
3814 //===---------------------------------------------------------------------===//
// Interleave (unpack) instruction class: emits rr and rm forms for one
// PUNPCK{L,H}{BW,WD,DQ,QDQ} variant.  `vt` fixes the vector element type,
// OpNode is X86Unpckl or X86Unpckh, and ld_frag selects the memory
// pattern fragment (alignment-checked `memop` for SSE, plain `load` for AVX).
// NOTE(review): the Is2Addr parameter line, the `!if(Is2Addr, ...)` selector
// and the rr Sched line appear elided in this excerpt (line numbers jump).
3816 let ExeDomain = SSEPackedInt in {
3817 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3818 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3819 X86FoldableSchedWrite sched, PatFrag ld_frag,
3821 def rr : PDI<opc, MRMSrcReg,
3822 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3824 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3825 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3826 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
// rm form: right-hand operand folded from memory.
3828 def rm : PDI<opc, MRMSrcMem,
3829 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3831 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3832 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3833 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3834 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Unpack instantiations.  Predicate split: byte/word unpacks need
// NoVLX_Or_NoBWI (their EVEX forms require AVX-512BW), while dword/qword
// unpacks only need NoVLX.  AVX forms are 3-operand (Is2Addr = 0) and use
// the unaligned `load` fragment; legacy SSE forms below are 2-address and
// use the alignment-checked `memop` fragment.
3837 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3838 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3839 i128mem, SchedWriteShuffle.XMM, load, 0>,
3841 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3842 i128mem, SchedWriteShuffle.XMM, load, 0>,
3844 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3845 i128mem, SchedWriteShuffle.XMM, load, 0>,
3847 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3848 i128mem, SchedWriteShuffle.XMM, load, 0>,
3852 let Predicates = [HasAVX, NoVLX] in {
3853 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3854 i128mem, SchedWriteShuffle.XMM, load, 0>,
3856 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3857 i128mem, SchedWriteShuffle.XMM, load, 0>,
3859 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3860 i128mem, SchedWriteShuffle.XMM, load, 0>,
3862 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3863 i128mem, SchedWriteShuffle.XMM, load, 0>,
// 256-bit AVX2 forms (Y suffix), same predicate split as above.
3867 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3868 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3869 i256mem, SchedWriteShuffle.YMM, load, 0>,
3870 VEX_4V, VEX_L, VEX_WIG;
3871 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3872 i256mem, SchedWriteShuffle.YMM, load, 0>,
3873 VEX_4V, VEX_L, VEX_WIG;
3874 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3875 i256mem, SchedWriteShuffle.YMM, load, 0>,
3876 VEX_4V, VEX_L, VEX_WIG;
3877 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3878 i256mem, SchedWriteShuffle.YMM, load, 0>,
3879 VEX_4V, VEX_L, VEX_WIG;
3882 let Predicates = [HasAVX2, NoVLX] in {
3883 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3884 i256mem, SchedWriteShuffle.YMM, load, 0>,
3885 VEX_4V, VEX_L, VEX_WIG;
3886 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3887 i256mem, SchedWriteShuffle.YMM, load, 0>,
3888 VEX_4V, VEX_L, VEX_WIG;
3889 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3890 i256mem, SchedWriteShuffle.YMM, load, 0>,
3891 VEX_4V, VEX_L, VEX_WIG;
3892 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3893 i256mem, SchedWriteShuffle.YMM, load, 0>,
3894 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE2 forms: destructive 2-address encoding, aligned memory operand.
3897 let Constraints = "$src1 = $dst" in {
3898 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3899 i128mem, SchedWriteShuffle.XMM, memop>;
3900 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3901 i128mem, SchedWriteShuffle.XMM, memop>;
3902 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3903 i128mem, SchedWriteShuffle.XMM, memop>;
3904 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3905 i128mem, SchedWriteShuffle.XMM, memop>;
3907 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3908 i128mem, SchedWriteShuffle.XMM, memop>;
3909 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3910 i128mem, SchedWriteShuffle.XMM, memop>;
3911 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3912 i128mem, SchedWriteShuffle.XMM, memop>;
3913 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3914 i128mem, SchedWriteShuffle.XMM, memop>;
3916 } // ExeDomain = SSEPackedInt
3918 //===---------------------------------------------------------------------===//
3919 // SSE2 - Packed Integer Extract and Insert
3920 //===---------------------------------------------------------------------===//
// PINSRW: insert a 16-bit value (from a GPR or a 16-bit memory location)
// into the word element of an XMM register selected by the u8 immediate.
// The multiclass is instantiated twice below: VEX 3-operand form and legacy
// 2-address form.  Register form reads only the low 16 bits of the GPR,
// hence the extra ReadDefault/ReadInt2Fpu scheduling reads.
3922 let ExeDomain = SSEPackedInt in {
3923 multiclass sse2_pinsrw<bit Is2Addr = 1> {
3924 def rr : Ii8<0xC4, MRMSrcReg,
3925 (outs VR128:$dst), (ins VR128:$src1,
3926 GR32orGR64:$src2, u8imm:$src3),
3928 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3929 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3931 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
3932 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
// Memory form: the word is any-extended from a 16-bit load (extloadi16).
3933 def rm : Ii8<0xC4, MRMSrcMem,
3934 (outs VR128:$dst), (ins VR128:$src1,
3935 i16mem:$src2, u8imm:$src3),
3937 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3938 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3940 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3942 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// PEXTRW: extract the immediate-selected word element into a GPR
// (zero-extended result modeled by the X86pextrw node).
3946 let Predicates = [HasAVX, NoBWI] in
3947 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3948 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3949 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3950 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3952 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
3953 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3954 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3955 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3956 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3958 Sched<[WriteVecExtract]>;
// Instantiate VEX (3-operand, Is2Addr = 0) and legacy (tied $src1/$dst)
// versions of pinsrw.
3961 let Predicates = [HasAVX, NoBWI] in
3962 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
3964 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3965 defm PINSRW : sse2_pinsrw, PD;
3967 } // ExeDomain = SSEPackedInt
3969 //===---------------------------------------------------------------------===//
3970 // SSE2 - Packed Mask Creation
3971 //===---------------------------------------------------------------------===//
// PMOVMSKB: gather the sign bit of every byte element into the low bits of
// a GPR (16 bits for XMM sources, 32 bits for the AVX2 YMM source), zeroing
// the rest.  Selected from the X86movmsk node.
3973 let ExeDomain = SSEPackedInt in {
3975 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3977 "pmovmskb\t{$src, $dst|$dst, $src}",
3978 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3979 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
// 256-bit form is AVX2-only.
3981 let Predicates = [HasAVX2] in {
3982 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3984 "pmovmskb\t{$src, $dst|$dst, $src}",
3985 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3986 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
// Legacy SSE2 encoding.
3989 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3990 "pmovmskb\t{$src, $dst|$dst, $src}",
3991 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3992 Sched<[WriteVecMOVMSK]>;
3994 } // ExeDomain = SSEPackedInt
3996 //===---------------------------------------------------------------------===//
3997 // SSE2 - Conditional Store
3998 //===---------------------------------------------------------------------===//
// MASKMOVDQU: byte-masked store of $src to [EDI]/[RDI] using the sign bit of
// each byte of $mask.  The implicit destination register is mode-dependent,
// so each encoding (VEX and legacy) is split into a 32-bit-mode variant
// (Uses = [EDI]) and a 64-bit-mode variant (Uses = [RDI]); the predicates
// make the pairs mutually exclusive.  Selected only from the intrinsic.
4000 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4001 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4002 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4003 (ins VR128:$src, VR128:$mask),
4004 "maskmovdqu\t{$mask, $src|$src, $mask}",
4005 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4007 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4008 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4009 (ins VR128:$src, VR128:$mask),
4010 "maskmovdqu\t{$mask, $src|$src, $mask}",
4011 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
// Legacy SSE2 encodings of the same two mode-specific variants.
4014 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4015 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4016 "maskmovdqu\t{$mask, $src|$src, $mask}",
4017 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4018 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4019 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4020 "maskmovdqu\t{$mask, $src|$src, $mask}",
4021 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4023 } // ExeDomain = SSEPackedInt
4025 //===---------------------------------------------------------------------===//
4026 // SSE2 - Move Doubleword/Quadword
4027 //===---------------------------------------------------------------------===//
4029 //===---------------------------------------------------------------------===//
4030 // Move Int Doubleword to Packed Double Int
// MOVD/MOVQ, GPR/memory -> XMM direction.  Each instruction places the
// scalar in element 0 of the vector (scalar_to_vector).  VEX-encoded forms
// first, then the legacy SSE2 encodings of the same opcodes.
4032 let ExeDomain = SSEPackedInt in {
4033 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4034 "movd\t{$src, $dst|$dst, $src}",
4036 (v4i32 (scalar_to_vector GR32:$src)))]>,
4037 VEX, Sched<[WriteVecMoveFromGpr]>;
4038 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4039 "movd\t{$src, $dst|$dst, $src}",
4041 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4042 VEX, Sched<[WriteVecLoad]>;
// 64-bit GPR -> XMM (REX.W / VEX.W form of the same opcode).
4043 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4044 "movq\t{$src, $dst|$dst, $src}",
4046 (v2i64 (scalar_to_vector GR64:$src)))]>,
4047 VEX, Sched<[WriteVecMoveFromGpr]>;
// Disassembler-only memory form: no ISel pattern, modeled as mayLoad.
4048 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4049 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4050 "movq\t{$src, $dst|$dst, $src}", []>,
4051 VEX, Sched<[WriteVecLoad]>;
// GR64 -> FR64 bitcast helper, codegen-only (assembler never needs it).
4052 let isCodeGenOnly = 1 in
4053 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4054 "movq\t{$src, $dst|$dst, $src}",
4055 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4056 VEX, Sched<[WriteVecMoveFromGpr]>;
// Legacy SSE2 encodings mirroring the VEX forms above.
4058 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4059 "movd\t{$src, $dst|$dst, $src}",
4061 (v4i32 (scalar_to_vector GR32:$src)))]>,
4062 Sched<[WriteVecMoveFromGpr]>;
4063 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4064 "movd\t{$src, $dst|$dst, $src}",
4066 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4067 Sched<[WriteVecLoad]>;
4068 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4069 "movq\t{$src, $dst|$dst, $src}",
4071 (v2i64 (scalar_to_vector GR64:$src)))]>,
4072 Sched<[WriteVecMoveFromGpr]>;
4073 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4074 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4075 "movq\t{$src, $dst|$dst, $src}", []>,
4076 Sched<[WriteVecLoad]>;
4077 let isCodeGenOnly = 1 in
4078 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4079 "movq\t{$src, $dst|$dst, $src}",
4080 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4081 Sched<[WriteVecMoveFromGpr]>;
4082 } // ExeDomain = SSEPackedInt
4084 //===---------------------------------------------------------------------===//
4085 // Move Int Doubleword to Single Scalar
// GR32 -> FR32 bitcast via movd, codegen-only (the assembler uses the
// VR128 forms above; these exist so ISel can select plain i32<->f32
// bitconverts into registers of the scalar FP class).
4087 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4088 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4089 "movd\t{$src, $dst|$dst, $src}",
4090 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4091 VEX, Sched<[WriteVecMoveFromGpr]>;
4093 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4094 "movd\t{$src, $dst|$dst, $src}",
4095 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4096 Sched<[WriteVecMoveFromGpr]>;
4098 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4100 //===---------------------------------------------------------------------===//
4101 // Move Packed Doubleword Int to Packed Double Int
// MOVD, XMM -> GPR32/memory direction (opcode 0x7E, MRMDest*).  Both forms
// extract element 0 of the v4i32 source; the mr forms store it directly.
4103 let ExeDomain = SSEPackedInt in {
4104 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4105 "movd\t{$src, $dst|$dst, $src}",
4106 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4108 Sched<[WriteVecMoveToGpr]>;
4109 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4110 (ins i32mem:$dst, VR128:$src),
4111 "movd\t{$src, $dst|$dst, $src}",
4112 [(store (i32 (extractelt (v4i32 VR128:$src),
4113 (iPTR 0))), addr:$dst)]>,
4114 VEX, Sched<[WriteVecStore]>;
// Legacy SSE2 encodings of the same two forms.
4115 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4116 "movd\t{$src, $dst|$dst, $src}",
4117 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4119 Sched<[WriteVecMoveToGpr]>;
4120 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4121 "movd\t{$src, $dst|$dst, $src}",
4122 [(store (i32 (extractelt (v4i32 VR128:$src),
4123 (iPTR 0))), addr:$dst)]>,
4124 Sched<[WriteVecStore]>;
4125 } // ExeDomain = SSEPackedInt
4127 //===---------------------------------------------------------------------===//
4128 // Move Packed Doubleword Int first element to Doubleword Int
// MOVQ, XMM -> GR64 (REX.W/VEX.W form of 0x7E): extract qword element 0.
// The mr store forms are disassembler-only (no pattern, mayStore).
4130 let ExeDomain = SSEPackedInt in {
4131 let SchedRW = [WriteVecMoveToGpr] in {
4132 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4133 "movq\t{$src, $dst|$dst, $src}",
4134 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4138 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4139 "movq\t{$src, $dst|$dst, $src}",
4140 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
// Disassembler-only memory variants.
4144 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4145 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4146 (ins i64mem:$dst, VR128:$src),
4147 "movq\t{$src, $dst|$dst, $src}", []>,
4148 VEX, Sched<[WriteVecStore]>;
4149 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4150 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4151 "movq\t{$src, $dst|$dst, $src}", []>,
4152 Sched<[WriteVecStore]>;
4153 } // ExeDomain = SSEPackedInt
4155 //===---------------------------------------------------------------------===//
4156 // Bitcast FR64 <-> GR64
// FR64 -> GR64 bitcast via movq, codegen-only: lets ISel lower f64<->i64
// bitconverts without going through memory.
4158 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4159 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4160 "movq\t{$src, $dst|$dst, $src}",
4161 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4162 VEX, Sched<[WriteVecMoveToGpr]>;
4164 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4165 "movq\t{$src, $dst|$dst, $src}",
4166 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4167 Sched<[WriteVecMoveToGpr]>;
4168 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4170 //===---------------------------------------------------------------------===//
4171 // Move Scalar Single to Double Int
// FR32 -> GR32 bitcast via movd, codegen-only: the f32<->i32 counterpart of
// the block above.
4173 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4174 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4175 "movd\t{$src, $dst|$dst, $src}",
4176 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4177 VEX, Sched<[WriteVecMoveToGpr]>;
4178 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4179 "movd\t{$src, $dst|$dst, $src}",
4180 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4181 Sched<[WriteVecMoveToGpr]>;
4182 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// ISel patterns: a scalar_to_vector whose upper elements are zeroed
// (X86vzmovl) maps directly onto movd/movq, since these instructions zero
// the high vector lanes.  Separate UseAVX and UseSSE2 pattern sets pick the
// matching encoding.
4184 let Predicates = [UseAVX] in {
4185 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4186 (VMOVDI2PDIrr GR32:$src)>;
4188 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4189 (VMOV64toPQIrr GR64:$src)>;
4191 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4192 // These instructions also write zeros in the high part of a 256-bit register.
4193 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4194 (VMOVDI2PDIrm addr:$src)>;
// 256-bit zero-extending load: do the 128-bit load, then widen with an
// implicit-zero SUBREG_TO_REG.
4195 def : Pat<(v8i32 (X86vzload32 addr:$src)),
4196 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4199 let Predicates = [UseSSE2] in {
4200 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4201 (MOVDI2PDIrr GR32:$src)>;
4203 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4204 (MOV64toPQIrr GR64:$src)>;
4205 def : Pat<(v4i32 (X86vzload32 addr:$src)),
4206 (MOVDI2PDIrm addr:$src)>;
4209 // Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4210 // "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
// Assembler-only aliases: accept "movd"/"vmovd" spellings for the 64-bit
// GPR<->XMM moves, but keep printing "movq"/"vmovq" (trailing 0 = don't
// use the alias for printing).
4212 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4213 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4214 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4215 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4216 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4217 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4218 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4219 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4220 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4222 //===---------------------------------------------------------------------===//
4223 // SSE2 - Move Quadword
4224 //===---------------------------------------------------------------------===//
4226 //===---------------------------------------------------------------------===//
4227 // Move Quadword Int to Packed Quadword Int
// MOVQ load (F3 0F 7E): load 64 bits from memory into element 0 of an XMM
// register, zeroing the upper lane.  XS marks the F3 prefix; note this is
// a distinct encoding from the 66-prefixed store form below.
4230 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4231 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4232 "vmovq\t{$src, $dst|$dst, $src}",
4234 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4235 VEX, Requires<[UseAVX]>, VEX_WIG;
4236 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4237 "movq\t{$src, $dst|$dst, $src}",
4239 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4240 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4241 } // ExeDomain, SchedRW
4243 //===---------------------------------------------------------------------===//
4244 // Move Packed Quadword Int to Quadword Int
// MOVQ store (66 0F D6): store qword element 0 of an XMM register to memory.
4246 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4247 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4248 "movq\t{$src, $dst|$dst, $src}",
4249 [(store (i64 (extractelt (v2i64 VR128:$src),
4250 (iPTR 0))), addr:$dst)]>,
4252 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4253 "movq\t{$src, $dst|$dst, $src}",
4254 [(store (i64 (extractelt (v2i64 VR128:$src),
4255 (iPTR 0))), addr:$dst)]>;
4256 } // ExeDomain, SchedRW
4258 // For disassembler only
// Register-register forms of the 0xD6 encoding exist only so the
// disassembler can round-trip them; codegen uses the 0x7E forms instead.
4259 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4260 SchedRW = [SchedWriteVecLogic.XMM] in {
4261 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4262 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4263 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4264 "movq\t{$src, $dst|$dst, $src}", []>;
// ".s" suffix spelling forces the store-form (0xD6) encoding in assembly.
4267 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4268 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4269 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4270 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
// Select zero-extending 64-bit vector loads onto the movq load, and
// extract-element-0 stores onto the movq store, per encoding family.
4272 let Predicates = [UseAVX] in {
4273 def : Pat<(v2i64 (X86vzload64 addr:$src)),
4274 (VMOVQI2PQIrm addr:$src)>;
// 256-bit version: 128-bit load widened with implicit zero upper half.
4275 def : Pat<(v4i64 (X86vzload64 addr:$src)),
4276 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4278 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4279 (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4282 let Predicates = [UseSSE2] in {
4283 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4285 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4286 (MOVPQI2QImr addr:$dst, VR128:$src)>;
4289 //===---------------------------------------------------------------------===//
4290 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4291 // IA32 document. movq xmm1, xmm2 does clear the high bits.
// MOVQ xmm, xmm: move the low qword and zero the upper 64 bits (the
// comment above this section notes the IA32 manual is misleading here).
// Selected from X86vzmovl on v2i64.
4293 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4294 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4295 "vmovq\t{$src, $dst|$dst, $src}",
4296 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4297 XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4298 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4299 "movq\t{$src, $dst|$dst, $src}",
4300 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4301 XS, Requires<[UseSSE2]>;
4302 } // ExeDomain, SchedRW
// The same instructions implement the f64 flavour of the zero-upper move.
4304 let Predicates = [UseAVX] in {
4305 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4306 (VMOVZPQILo2PQIrr VR128:$src)>;
4308 let Predicates = [UseSSE2] in {
4309 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4310 (MOVZPQILo2PQIrr VR128:$src)>;
// 256-bit X86vzmovl: operate on the low xmm subregister, then widen with
// SUBREG_TO_REG (upper 128 bits implicitly zeroed by the VEX.128 movq).
4313 let Predicates = [UseAVX] in {
4314 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4315 (SUBREG_TO_REG (i32 0),
4316 (v2f64 (VMOVZPQILo2PQIrr
4317 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4319 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4320 (SUBREG_TO_REG (i32 0),
4321 (v2i64 (VMOVZPQILo2PQIrr
4322 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4326 //===---------------------------------------------------------------------===//
4327 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4328 //===---------------------------------------------------------------------===//
// MOVSHDUP/MOVSLDUP: duplicate the odd (shdup) or even (sldup) f32 elements
// of the source.  One multiclass parameterized over vector width; the rm
// form folds the source load.
4330 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4331 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4332 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4333 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4334 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4335 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4337 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4338 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4339 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4340 Sched<[sched.Folded]>;
// VEX-encoded 128- and 256-bit instantiations, then the legacy SSE3 forms.
// AVX forms use unaligned `loadv*`, SSE forms the alignment-checked memopv4f32.
4343 let Predicates = [HasAVX, NoVLX] in {
4344 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4345 v4f32, VR128, loadv4f32, f128mem,
4346 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4347 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4348 v4f32, VR128, loadv4f32, f128mem,
4349 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4350 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4351 v8f32, VR256, loadv8f32, f256mem,
4352 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4353 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4354 v8f32, VR256, loadv8f32, f256mem,
4355 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4357 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4358 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4359 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4360 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
// The multiclass patterns above only cover the f32 types; these extra
// patterns reuse the same instructions for the equivalent v4i32/v8i32
// shuffles (the operation is a pure lane permutation, type-agnostic).
4362 let Predicates = [HasAVX, NoVLX] in {
4363 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4364 (VMOVSHDUPrr VR128:$src)>;
4365 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4366 (VMOVSHDUPrm addr:$src)>;
4367 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4368 (VMOVSLDUPrr VR128:$src)>;
4369 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4370 (VMOVSLDUPrm addr:$src)>;
4371 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4372 (VMOVSHDUPYrr VR256:$src)>;
4373 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4374 (VMOVSHDUPYrm addr:$src)>;
4375 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4376 (VMOVSLDUPYrr VR256:$src)>;
4377 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4378 (VMOVSLDUPYrm addr:$src)>;
// SSE3 versions: note `memop` (aligned) instead of `load`.
4381 let Predicates = [UseSSE3] in {
4382 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4383 (MOVSHDUPrr VR128:$src)>;
4384 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4385 (MOVSHDUPrm addr:$src)>;
4386 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4387 (MOVSLDUPrr VR128:$src)>;
4388 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4389 (MOVSLDUPrm addr:$src)>;
4392 //===---------------------------------------------------------------------===//
4393 // SSE3 - Replicate Double FP - MOVDDUP
4394 //===---------------------------------------------------------------------===//
// MOVDDUP: broadcast the low f64 element.  The 128-bit rm form loads only
// 64 bits (f64mem / loadf64) and duplicates it; the 256-bit form loads a
// full 256 bits and duplicates the even elements.
4396 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4397 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4398 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4399 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4401 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4402 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4405 (scalar_to_vector (loadf64 addr:$src)))))]>,
4406 Sched<[sched.XMM.Folded]>;
4409 // FIXME: Merge with above classes when there are patterns for the ymm version
4410 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4411 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4412 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4413 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4415 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4416 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4418 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4419 Sched<[sched.YMM.Folded]>;
4422 let Predicates = [HasAVX, NoVLX] in {
4423 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4425 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4426 VEX, VEX_L, VEX_WIG;
4429 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
// A zero-extending 64-bit load feeding movddup only needs the 64-bit
// memory form — the duplicated low qword is all the instruction reads.
// NOTE(review): the inner Requires<[HasAVX]> duplicates the enclosing
// predicate — harmless but redundant; confirm against the full file.
4432 let Predicates = [HasAVX, NoVLX] in {
4433 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4434 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4437 let Predicates = [UseSSE3] in {
4438 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4439 (MOVDDUPrm addr:$src)>;
4442 //===---------------------------------------------------------------------===//
4443 // SSE3 - Move Unaligned Integer
4444 //===---------------------------------------------------------------------===//
// LDDQU: unaligned 128/256-bit integer load that may read across a
// cache-line split.  Only reachable via the intrinsics, so it is never
// selected for ordinary unaligned loads.
4446 let Predicates = [HasAVX] in {
4447 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4448 "vlddqu\t{$src, $dst|$dst, $src}",
4449 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4450 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4451 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4452 "vlddqu\t{$src, $dst|$dst, $src}",
4453 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4454 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4457 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4458 "lddqu\t{$src, $dst|$dst, $src}",
4459 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4460 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4462 //===---------------------------------------------------------------------===//
4463 // SSE3 - Arithmetic
4464 //===---------------------------------------------------------------------===//
// ADDSUBPS/ADDSUBPD: alternately subtract (even elements) and add (odd
// elements).  FP instructions, so they read MXCSR and may raise FP
// exceptions — modeled by the Uses/mayRaiseFPException wrapper.
4466 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4467 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4468 PatFrag ld_frag, bit Is2Addr = 1> {
4469 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4470 def rr : I<0xD0, MRMSrcReg,
4471 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4473 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4474 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4475 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4477 def rm : I<0xD0, MRMSrcMem,
4478 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4480 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4481 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4482 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4483 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations: PS variants carry the XD (F2) prefix, PD variants the
// PD (66) prefix; ExeDomain separates single/double FP domains.
4487 let Predicates = [HasAVX] in {
4488 let ExeDomain = SSEPackedSingle in {
4489 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4490 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4491 XD, VEX_4V, VEX_WIG;
4492 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4493 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4494 XD, VEX_4V, VEX_L, VEX_WIG;
4496 let ExeDomain = SSEPackedDouble in {
4497 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4498 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4499 PD, VEX_4V, VEX_WIG;
4500 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4501 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4502 PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE3 2-address forms with aligned memory fragments.
4505 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4506 let ExeDomain = SSEPackedSingle in
4507 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4508 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4509 let ExeDomain = SSEPackedDouble in
4510 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4511 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4514 //===---------------------------------------------------------------------===//
4515 // SSE3 Instructions
4516 //===---------------------------------------------------------------------===//
// S3D_Int - SSE3 horizontal-op class using the S3DI (F2-prefixed) format.
// Emits the register-register (rr) and register-memory (rm) forms for a
// packed FP horizontal operation (HADDPS/HSUBPS). The alternative asm
// strings select the 2-operand (legacy SSE) or 3-operand (VEX) syntax
// based on the Is2Addr-style trailing bit of the multiclass.
// Both forms read MXCSR and may raise FP exceptions.
4519 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4520 X86MemOperand x86memop, SDNode OpNode,
4521 X86FoldableSchedWrite sched, PatFrag ld_frag,
4523 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4524 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4526 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4527 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4528 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
// Memory form: second operand folded from memory via ld_frag.
4531 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4533 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4534 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4535 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4536 Sched<[sched.Folded, sched.ReadAfterFold]>;
// S3_Int - Same structure as S3D_Int but uses the S3I (66-prefixed) format,
// for the packed-double horizontal ops (HADDPD/HSUBPD).
// Both forms read MXCSR and may raise FP exceptions.
4539 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4540 X86MemOperand x86memop, SDNode OpNode,
4541 X86FoldableSchedWrite sched, PatFrag ld_frag,
4543 let Uses = [MXCSR], mayRaiseFPException = 1 in {
4544 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4546 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4547 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4548 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
// Memory form: second operand folded from memory via ld_frag.
4551 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4553 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4554 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4555 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4556 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX horizontal add/subtract: VHADDPS/VHSUBPS (single, opcodes 0x7C/0x7D via
// S3D_Int) and VHADDPD/VHSUBPD (double, via S3_Int), 128-bit and 256-bit,
// all non-destructive 3-operand VEX forms.
4560 let Predicates = [HasAVX] in {
4561 let ExeDomain = SSEPackedSingle in {
4562 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4563 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4564 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4565 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4566 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4567 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4568 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4569 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4571 let ExeDomain = SSEPackedDouble in {
4572 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4573 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4574 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4575 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4576 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4577 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4578 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4579 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE3 horizontal add/subtract: destructive 2-operand forms
// ($src1 tied to $dst), aligned memory operands (memopv*).
4583 let Constraints = "$src1 = $dst" in {
4584 let ExeDomain = SSEPackedSingle in {
4585 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4586 WriteFHAdd, memopv4f32>;
4587 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4588 WriteFHAdd, memopv4f32>;
4590 let ExeDomain = SSEPackedDouble in {
4591 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4592 WriteFHAdd, memopv2f64>;
4593 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4594 WriteFHAdd, memopv2f64>;
4598 //===---------------------------------------------------------------------===//
4599 // SSSE3 - Packed Absolute Instructions
4600 //===---------------------------------------------------------------------===//
4602 /// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// Emits 128-bit rr and rm forms of a unary vector op (used for PABSB/W/D).
// The rm form folds its only operand from memory via ld_frag.
4603 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4604 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4605 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4607 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4608 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4611 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4613 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4615 (vt (OpNode (ld_frag addr:$src))))]>,
4616 Sched<[sched.XMM.Folded]>;
4619 /// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 256-bit (YMM) counterpart of SS3I_unop_rm; defs are suffixed Yrr/Yrm and
// the memory form uses the generic `load` frag (VEX loads need no alignment).
4620 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4621 SDNode OpNode, X86SchedWriteWidths sched> {
4622 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4624 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4625 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4628 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4630 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4632 (vt (OpNode (load addr:$src))))]>,
4633 Sched<[sched.YMM.Folded]>;
// PABSB/PABSW/PABSD instantiations (packed absolute value, `abs` node).
// Predicates keep these from clashing with the AVX-512VL/BWI EVEX forms:
// byte/word ops are disabled by VLX+BWI, dword ops by VLX alone.
4636 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4637 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4638 load>, VEX, VEX_WIG;
4639 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4640 load>, VEX, VEX_WIG;
4642 let Predicates = [HasAVX, NoVLX] in {
4643 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4644 load>, VEX, VEX_WIG;
4646 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4647 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4648 VEX, VEX_L, VEX_WIG;
4649 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4650 VEX, VEX_L, VEX_WIG;
4652 let Predicates = [HasAVX2, NoVLX] in {
4653 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4654 VEX, VEX_L, VEX_WIG;
// Legacy SSSE3 forms.
4657 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4659 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4661 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4664 //===---------------------------------------------------------------------===//
4665 // SSSE3 - Packed Binary Operator Instructions
4666 //===---------------------------------------------------------------------===//
4668 /// SS3I_binop_rm - Simple SSSE3 bin op
// Pattern-based binary op: DstVT may differ from OpVT (e.g. pmaddubsw maps
// v16i8 inputs to a v8i16 result). The rr form is marked commutable so the
// register allocator may swap operands; the rm form folds $src2 from memory.
4669 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4670 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4671 PatFrag memop_frag, X86MemOperand x86memop,
4672 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4673 let isCommutable = 1 in
4674 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4675 (ins RC:$src1, RC:$src2),
4677 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4678 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4679 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4681 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4682 (ins RC:$src1, x86memop:$src2),
4684 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4685 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4687 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4688 Sched<[sched.Folded, sched.ReadAfterFold]>;
4691 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
// Intrinsic-based 128-bit binary op: matches an IR intrinsic (IntId128)
// directly instead of a DAG node (used for psign/phadds/phsubs).
4692 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4693 Intrinsic IntId128, X86FoldableSchedWrite sched,
4694 PatFrag ld_frag, bit Is2Addr = 1> {
4695 let isCommutable = 1 in
4696 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4697 (ins VR128:$src1, VR128:$src2),
4699 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4700 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4701 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4703 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4704 (ins VR128:$src1, i128mem:$src2),
4706 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4707 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4709 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4710 Sched<[sched.Folded, sched.ReadAfterFold]>;
// SS3I_binop_rm_int_y - 256-bit (YMM) counterpart of SS3I_binop_rm_int.
// Always VEX 3-operand syntax (no 2-address asm alternative); defs are
// suffixed Yrr/Yrm.
4713 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4715 X86FoldableSchedWrite sched> {
4716 let isCommutable = 1 in
4717 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4718 (ins VR256:$src1, VR256:$src2),
4719 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4720 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4722 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4723 (ins VR256:$src1, i256mem:$src2),
4724 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4726 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4727 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX/AVX2 instantiations of the SSSE3 binary ops (pshufb, pmaddubsw,
// pmulhrsw, phadd*/phsub*, psign*). None take an immediate (ImmT = NoImm).
// BWI-sensitive ops get the NoVLX_Or_NoBWI predicate to defer to EVEX forms.
4730 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4731 let isCommutable = 0 in {
4732 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4733 VR128, load, i128mem,
4734 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4735 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4736 v16i8, VR128, load, i128mem,
4737 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4739 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4740 VR128, load, i128mem,
4741 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
// 128-bit horizontal add/sub and sign ops: plain HasAVX (no EVEX versions).
4744 let ImmT = NoImm, Predicates = [HasAVX] in {
4745 let isCommutable = 0 in {
4746 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4748 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4749 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4751 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4752 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4754 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4755 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4757 SchedWritePHAdd.XMM, 0>, VEX_4V;
4758 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4759 int_x86_ssse3_psign_b_128,
4760 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4761 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4762 int_x86_ssse3_psign_w_128,
4763 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4764 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
4765 int_x86_ssse3_psign_d_128,
4766 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4767 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4768 int_x86_ssse3_phadd_sw_128,
4769 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4770 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4771 int_x86_ssse3_phsub_sw_128,
4772 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
// 256-bit AVX2 byte/word ops, again gated against the EVEX forms.
4776 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4777 let isCommutable = 0 in {
4778 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4779 VR256, load, i256mem,
4780 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4781 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4782 v32i8, VR256, load, i256mem,
4783 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4785 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4786 VR256, load, i256mem,
4787 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
// 256-bit horizontal add/sub and sign ops.
4790 let ImmT = NoImm, Predicates = [HasAVX2] in {
4791 let isCommutable = 0 in {
4792 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4793 VR256, load, i256mem,
4794 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4795 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4797 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4798 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4799 VR256, load, i256mem,
4800 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4801 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4803 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
4804 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4805 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4806 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4807 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4808 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4809 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4810 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4811 int_x86_avx2_phadd_sw,
4812 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4813 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4814 int_x86_avx2_phsub_sw,
4815 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4819 // None of these have i8 immediate fields.
// Legacy SSSE3 instantiations: destructive 2-operand forms ($src1 tied to
// $dst), aligned memory operands (memop).
4820 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4821 let isCommutable = 0 in {
4822 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4823 memop, i128mem, SchedWritePHAdd.XMM>;
4824 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4825 memop, i128mem, SchedWritePHAdd.XMM>;
4826 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4827 memop, i128mem, SchedWritePHAdd.XMM>;
4828 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4829 memop, i128mem, SchedWritePHAdd.XMM>;
4830 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4831 SchedWriteVecALU.XMM, memop>;
4832 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4833 SchedWriteVecALU.XMM, memop>;
4834 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4835 SchedWriteVecALU.XMM, memop>;
4836 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4837 memop, i128mem, SchedWriteVarShuffle.XMM>;
4838 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4839 int_x86_ssse3_phadd_sw_128,
4840 SchedWritePHAdd.XMM, memop>;
4841 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4842 int_x86_ssse3_phsub_sw_128,
4843 SchedWritePHAdd.XMM, memop>;
4844 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4845 v16i8, VR128, memop, i128mem,
4846 SchedWriteVecIMul.XMM>;
4848 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4849 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4852 //===---------------------------------------------------------------------===//
4853 // SSSE3 - Packed Align Instruction Patterns
4854 //===---------------------------------------------------------------------===//
// ssse3_palignr - PALIGNR: concatenates the two sources and extracts a
// byte-aligned result at the offset given by the u8 immediate ($src3),
// matched via the X86PAlignr node. rri = reg/reg form, rmi = reg/mem form.
// hasSideEffects = 0 since the defs are fully described by their patterns.
4856 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4857 PatFrag memop_frag, X86MemOperand x86memop,
4858 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4859 let hasSideEffects = 0 in {
4860 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4861 (ins RC:$src1, RC:$src2, u8imm:$src3),
4863 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4865 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4866 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4869 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4870 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4872 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4874 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4875 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4876 (memop_frag addr:$src2),
4877 (i8 timm:$src3))))]>,
4878 Sched<[sched.Folded, sched.ReadAfterFold]>;
// PALIGNR instantiations: VEX 128/256-bit forms (gated against EVEX BWI
// versions) and the legacy 2-operand SSSE3 form.
4882 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4883 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4884 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4885 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4886 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4887 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4888 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4889 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4890 SchedWriteShuffle.XMM>;
4892 //===---------------------------------------------------------------------===//
4893 // SSE3 - Thread synchronization
4894 //===---------------------------------------------------------------------===//
// MONITOR/MWAIT (SSE3 thread-synchronization instructions).
// MONITOR takes its address in EAX (32-bit mode) or RAX (64-bit mode),
// hence two defs split by Not64BitMode/In64BitMode; MWAIT reads ECX/EAX.
4896 let SchedRW = [WriteSystem] in {
4897 let Uses = [EAX, ECX, EDX] in
4898 def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4899 TB, Requires<[HasSSE3, Not64BitMode]>;
4900 let Uses = [RAX, ECX, EDX] in
4901 def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4902 TB, Requires<[HasSSE3, In64BitMode]>;
4904 let Uses = [ECX, EAX] in
4905 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
4906 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
// Assembler aliases accepting the explicit-operand spellings.
4909 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4910 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4912 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4913 Requires<[Not64BitMode]>;
4914 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4915 Requires<[In64BitMode]>;
4917 //===----------------------------------------------------------------------===//
4918 // SSE4.1 - Packed Move with Sign/Zero Extend
4919 // NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4920 //===----------------------------------------------------------------------===//
// SS41I_pmovx_rrrm - One PMOVSX/PMOVZX opcode: rr and rm forms with no
// patterns (selection is handled by the Pat<> blocks further below).
4922 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4923 RegisterClass OutRC, RegisterClass InRC,
4924 X86FoldableSchedWrite sched> {
4925 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4926 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4929 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4930 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4931 Sched<[sched.Folded]>;
// SS41I_pmovx_rm_all - Instantiates one pmovx opcode at all three encoding
// levels: legacy SSE4.1 (NAME), AVX 128-bit (V#NAME), AVX2 256-bit
// (V#NAME#Y, which widens VR128 -> VR256 and uses the wider MemYOp).
4934 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4935 X86MemOperand MemOp, X86MemOperand MemYOp,
4937 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4938 SchedWriteShuffle.XMM>;
4939 let Predicates = [HasAVX, prd] in
4940 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4941 VR128, VR128, SchedWriteShuffle.XMM>,
4943 let Predicates = [HasAVX2, prd] in
4944 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4945 VR256, VR128, WriteShuffle256>,
4946 VEX, VEX_L, VEX_WIG;
// SS41I_pmovx_rm - Pairs a sign-extend opcode with its zero-extend twin:
// PMOVZX* opcodes are exactly PMOVSX* + 0x10.
4949 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4950 X86MemOperand MemYOp, Predicate prd> {
4951 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4952 MemOp, MemYOp, prd>;
4953 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4954 !strconcat("pmovzx", OpcodeStr),
4955 MemOp, MemYOp, prd>;
// All six element-width combinations; the memory operand shrinks as the
// extension ratio grows (bw reads 64 bits, bq only 16).
4958 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4959 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4960 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4962 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4963 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4965 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
// SS41I_pmovx_avx2_patterns - Selection patterns mapping sext/zext DAG nodes
// (and their *_invec in-vector variants, which extend only the low elements)
// onto the 256-bit VPMOVSX/VPMOVZX instructions declared above.
4968 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4969 SDNode ExtOp, SDNode InVecOp> {
4970 // Register-Register patterns
4971 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4972 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4973 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4975 let Predicates = [HasAVX2, NoVLX] in {
4976 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4977 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4978 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4979 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4981 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4982 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4983 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4984 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4986 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4987 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4990 // Simple Register-Memory patterns
// ExtTy selects the s/z extloadvi* PatFrags by name at instantiation time.
4991 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4992 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4993 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4995 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
4996 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4999 let Predicates = [HasAVX2, NoVLX] in {
5000 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5001 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5002 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5003 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5005 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5006 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5007 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5008 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5010 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5011 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5014 // AVX2 Register-Memory patterns
// Matches partial loads (scalar_to_vector / vzload) bitcast to the source
// vector type, folding them into the rm forms.
5015 let Predicates = [HasAVX2, NoVLX] in {
5016 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5017 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5019 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5020 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5021 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5022 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5023 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5024 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5026 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5027 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5029 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5030 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5031 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5032 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5034 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5035 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5036 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5037 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5038 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5039 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
// Instantiate for the signed and unsigned extension families.
5043 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5044 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5046 // SSE4.1/AVX patterns.
// SS41I_pmovx_patterns - 128-bit counterpart of the AVX2 pattern block:
// maps ExtOp (an in-vector sext/zext) onto the 128-bit PMOVSX/PMOVZX rr/rm
// forms; instantiated below for VEX (HasAVX) and legacy (UseSSE41) prefixes.
5047 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5049 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5050 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5051 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5053 let Predicates = [HasAVX, NoVLX] in {
5054 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5055 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5056 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5057 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5059 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5060 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5061 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5062 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5064 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5065 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
// Extending-load patterns, selected by the s/z ExtTy prefix.
5067 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5068 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5069 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5071 let Predicates = [HasAVX, NoVLX] in {
5072 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5073 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5074 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5075 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5077 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5078 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5079 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5080 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5082 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5083 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// Partial-load patterns (scalar_to_vector / vzload bitcast to the source
// vector type) folded into the rm forms.
5085 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5086 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5087 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5088 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5089 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5090 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5091 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5092 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5093 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5095 let Predicates = [HasAVX, NoVLX] in {
5096 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5097 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5098 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5099 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5100 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5101 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5103 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5104 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5105 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5106 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5108 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5109 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5110 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5111 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5112 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5113 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5114 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5115 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5117 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5118 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5119 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5120 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5121 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5122 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5124 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5125 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5126 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5127 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5128 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5129 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5130 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5131 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// VEX instantiations (unconditional) and legacy forms gated on UseSSE41.
5135 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5136 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5138 let Predicates = [UseSSE41] in {
5139 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5140 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5143 //===----------------------------------------------------------------------===//
5144 // SSE4.1 - Extract Instructions
5145 //===----------------------------------------------------------------------===//
5147 /// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
// rr: element selected by imm goes (zero-extended) into a GR32/GR64 via the
// X86pextrb node. mr: element is truncated to i8 and stored to memory.
5148 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5149 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5150 (ins VR128:$src1, u8imm:$src2),
5151 !strconcat(OpcodeStr,
5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5153 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5155 Sched<[WriteVecExtract]>;
5156 let hasSideEffects = 0, mayStore = 1 in
5157 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5158 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5159 !strconcat(OpcodeStr,
5160 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5161 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5162 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VEX form is gated on NoBWI to defer to the EVEX version when available.
5165 let Predicates = [HasAVX, NoBWI] in
5166 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5168 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5171 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
// Only the store (mr) form has a pattern; the rr_REV def exists purely so
// the disassembler can decode the register form (isCodeGenOnly +
// ForceDisassemble), with FoldGenData tying it to the base name.
5172 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5173 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5174 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5175 (ins VR128:$src1, u8imm:$src2),
5176 !strconcat(OpcodeStr,
5177 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5178 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5180 let hasSideEffects = 0, mayStore = 1 in
5181 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5182 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5183 !strconcat(OpcodeStr,
5184 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5185 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5186 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VEX form is gated on NoBWI to defer to the EVEX version when available.
5189 let Predicates = [HasAVX, NoBWI] in
5190 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5192 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5195 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
// Matches generic extractelt on v4i32, to GR32 (rr) or to memory (mr).
5196 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5197 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5198 (ins VR128:$src1, u8imm:$src2),
5199 !strconcat(OpcodeStr,
5200 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5202 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5203 Sched<[WriteVecExtract]>;
5204 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5205 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5206 !strconcat(OpcodeStr,
5207 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5208 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5209 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VEX form is gated on NoDQI to defer to the EVEX version when available.
5212 let Predicates = [HasAVX, NoDQI] in
5213 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5215 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5217 /// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
// Matches generic extractelt on v2i64, to GR64 (rr) or to memory (mr).
// Same 0x16 opcode as PEXTRD, distinguished by the REX.W/VEX.W bit.
5218 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5219 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5220 (ins VR128:$src1, u8imm:$src2),
5221 !strconcat(OpcodeStr,
5222 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5224 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5225 Sched<[WriteVecExtract]>;
5226 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5227 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5228 !strconcat(OpcodeStr,
5229 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5230 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5231 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// VEX form is gated on NoDQI to defer to the EVEX version when available.
5234 let Predicates = [HasAVX, NoDQI] in
5235 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5237 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5239 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
// EXTRACTPS: the v4f32 source is bitcast to v4i32 and one lane is moved to a
// GPR (rr) or stored (mr); the element itself is not converted.
5241 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5242 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5243 (ins VR128:$src1, u8imm:$src2),
5244 !strconcat(OpcodeStr,
5245 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5246 [(set GR32orGR64:$dst,
5247 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5248 Sched<[WriteVecExtract]>;
5249 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5250 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5251 !strconcat(OpcodeStr,
5252 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5253 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5254 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5257 let ExeDomain = SSEPackedSingle in {
5258 let Predicates = [UseAVX] in
5259 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5260 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5263 //===----------------------------------------------------------------------===//
5264 // SSE4.1 - Insert Instructions
5265 //===----------------------------------------------------------------------===//
// SS41I_insert8 - PINSRB: insert a byte from a GPR (rr) or from memory via
// an anyext i8 load (rm) into the v16i8 lane selected by the immediate,
// matched through the X86pinsrb node. Is2Addr picks the 2- vs 3-operand asm.
5267 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5268 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5269 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5271 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5273 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5275 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5276 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5277 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5278 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5280 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5282 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5284 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5285 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// VEX form is gated on NoBWI to defer to the EVEX version when available.
5288 let Predicates = [HasAVX, NoBWI] in
5289 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5290 let Constraints = "$src1 = $dst" in
5291 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
// SS41I_insert32 - PINSRD: insert a dword from GR32 (rr) or from a 32-bit
// load (rm) into the v4i32 lane selected by the immediate, matched through
// the generic insertelt node.
5293 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5294 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5295 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5297 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5299 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5301 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5302 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5303 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5304 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5306 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5308 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5310 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5311 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// VEX form is gated on NoDQI to defer to the EVEX version when available.
5314 let Predicates = [HasAVX, NoDQI] in
5315 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5316 let Constraints = "$src1 = $dst" in
5317 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
// SS41I_insert64 - PINSRQ: 64-bit variant of the insert pattern above;
// same opcode (0x22) as PINSRD, distinguished by REX.W / VEX.W.
5319 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5320 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5321 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5323 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5325 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5327 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5328 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5329 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5330 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5332 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5334 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5336 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5337 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5340 let Predicates = [HasAVX, NoDQI] in
5341 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5342 let Constraints = "$src1 = $dst" in
5343 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5345 // insertps has a few different modes, there's the first two here below which
5346 // are optimized inserts that won't zero arbitrary elements in the destination
5347 // vector. The next one matches the intrinsic and could zero arbitrary elements
5348 // in the target vector.
// SS41I_insertf32 - INSERTPS: insert a single f32 (from an XMM lane or a
// scalar load widened via scalar_to_vector) under control of the immediate,
// modeled by the X86insertps node. rr form is commutable (the immediate
// encodes both source and destination lanes).
5349 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5350 let isCommutable = 1 in
5351 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5352 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5354 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5356 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5358 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5359 Sched<[SchedWriteFShuffle.XMM]>;
5360 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5361 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5363 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5365 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5367 (X86insertps VR128:$src1,
5368 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5370 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5373 let ExeDomain = SSEPackedSingle in {
5374 let Predicates = [UseAVX] in
5375 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5377 let Constraints = "$src1 = $dst" in
5378 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5381 //===----------------------------------------------------------------------===//
5382 // SSE4.1 - Round Instructions
5383 //===----------------------------------------------------------------------===//
// sse41_fp_unop_p - packed FP rounding (ROUNDPS/PD and VEX variants).
// Emits the reg (r) and mem (m) forms; both read MXCSR and may raise FP
// exceptions. The immediate selects the rounding mode.
5385 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5386 X86MemOperand x86memop, RegisterClass RC,
5387 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5388 X86FoldableSchedWrite sched> {
5389 // Intrinsic operation, reg.
5390 // Vector intrinsic operation, reg
5391 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5392 def r : SS4AIi8<opc, MRMSrcReg,
5393 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5394 !strconcat(OpcodeStr,
5395 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5396 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5399 // Vector intrinsic operation, mem
5400 def m : SS4AIi8<opc, MRMSrcMem,
5401 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5402 !strconcat(OpcodeStr,
5403 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5405 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5406 Sched<[sched.Folded]>;
// avx_fp_unop_rm - assembler-only (isCodeGenOnly, empty patterns) scalar
// VROUNDSS/VROUNDSD forms over FR32/FR64; selection happens via separate
// Pat<> rules. hasSideEffects = 0 because the patterns are empty.
5410 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5411 string OpcodeStr, X86FoldableSchedWrite sched> {
5412 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5413 def SSr : SS4AIi8<opcss, MRMSrcReg,
5414 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5415 !strconcat(OpcodeStr,
5416 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5417 []>, Sched<[sched]>;
5420 def SSm : SS4AIi8<opcss, MRMSrcMem,
5421 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5422 !strconcat(OpcodeStr,
5423 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5424 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5425 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5427 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5428 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5429 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5430 !strconcat(OpcodeStr,
5431 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5432 []>, Sched<[sched]>;
5435 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5436 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5437 !strconcat(OpcodeStr,
5438 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5439 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5440 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_unop_s - legacy (non-VEX, 2-operand) scalar ROUNDSS/ROUNDSD
// forms; like avx_fp_unop_rm these are pattern-less and matched via Pat<>
// rules below. Reads MXCSR and may raise FP exceptions.
5443 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5444 string OpcodeStr, X86FoldableSchedWrite sched> {
5445 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5446 let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5447 def SSr : SS4AIi8<opcss, MRMSrcReg,
5448 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5449 !strconcat(OpcodeStr,
5450 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5451 []>, Sched<[sched]>;
5454 def SSm : SS4AIi8<opcss, MRMSrcMem,
5455 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5456 !strconcat(OpcodeStr,
5457 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5458 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5459 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5461 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5462 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5463 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5464 !strconcat(OpcodeStr,
5465 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5466 []>, Sched<[sched]>;
5469 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5470 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5471 !strconcat(OpcodeStr,
5472 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5473 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5474 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// sse41_fp_binop_s - intrinsic (_Int) forms of scalar ROUNDSS/ROUNDSD over
// VR128, matching OpNode (X86RndScales) with VT32/VT64 result types; the
// memory forms fold a scalar sse_load_f32/f64.
5478 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5479 string OpcodeStr, X86FoldableSchedWrite sched,
5480 ValueType VT32, ValueType VT64,
5481 SDNode OpNode, bit Is2Addr = 1> {
5482 let Uses = [MXCSR], mayRaiseFPException = 1 in {
5483 let ExeDomain = SSEPackedSingle in {
5484 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5485 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5487 !strconcat(OpcodeStr,
5488 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5489 !strconcat(OpcodeStr,
5490 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5491 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5494 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5495 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5497 !strconcat(OpcodeStr,
5498 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5499 !strconcat(OpcodeStr,
5500 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5502 (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5503 Sched<[sched.Folded, sched.ReadAfterFold]>;
5504 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5506 let ExeDomain = SSEPackedDouble in {
5507 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5508 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5510 !strconcat(OpcodeStr,
5511 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5512 !strconcat(OpcodeStr,
5513 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5514 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5517 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5518 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5520 !strconcat(OpcodeStr,
5521 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5522 !strconcat(OpcodeStr,
5523 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5525 (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5526 Sched<[sched.Folded, sched.ReadAfterFold]>;
5527 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5531 // FP round - roundss, roundps, roundsd, roundpd
// FP round - roundss, roundps, roundsd, roundpd
// Instantiations of the round multiclasses plus Pat<> rules that select the
// codegen-only scalar forms for X86any_VRndScale. Load-folding patterns are
// restricted to OptForSize since folding loses the register round result.
5532 let Predicates = [HasAVX, NoVLX] in {
5533 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5535 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5536 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5538 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5539 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5540 VEX, VEX_L, VEX_WIG;
5543 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5544 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5545 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5547 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5548 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5549 VEX, VEX_L, VEX_WIG;
// Scalar VROUND: intrinsic forms (binop_s) plus asm-only FR32/FR64 forms.
5552 let Predicates = [UseAVX] in {
5553 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5554 v4f32, v2f64, X86RndScales, 0>,
5555 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5556 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5557 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
// Select the codegen-only scalar forms; $src1 is a don't-care pass-through.
5560 let Predicates = [UseAVX] in {
5561 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5562 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5563 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5564 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5567 let Predicates = [UseAVX, OptForSize] in {
5568 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5569 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5570 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5571 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
// Legacy SSE4.1 (non-VEX) instantiations and the matching patterns.
5574 let ExeDomain = SSEPackedSingle in
5575 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5576 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5577 let ExeDomain = SSEPackedDouble in
5578 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5579 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5581 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5583 let Constraints = "$src1 = $dst" in
5584 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5585 v4f32, v2f64, X86RndScales>;
5587 let Predicates = [UseSSE41] in {
5588 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5589 (ROUNDSSr FR32:$src1, timm:$src2)>;
5590 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5591 (ROUNDSDr FR64:$src1, timm:$src2)>;
5594 let Predicates = [UseSSE41, OptForSize] in {
5595 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5596 (ROUNDSSm addr:$src1, timm:$src2)>;
5597 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5598 (ROUNDSDm addr:$src1, timm:$src2)>;
5601 //===----------------------------------------------------------------------===//
5602 // SSE4.1 - Packed Bit Test
5603 //===----------------------------------------------------------------------===//
5605 // ptest instruction we'll lower to this in X86ISelLowering primarily from
5606 // the intel intrinsic that corresponds to this.
// PTEST/VPTEST: set EFLAGS from the X86ptest node; no register result.
// VEX forms cover 128- and 256-bit; legacy form is 128-bit only.
5607 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5608 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5609 "vptest\t{$src2, $src1|$src1, $src2}",
5610 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5611 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5612 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5613 "vptest\t{$src2, $src1|$src1, $src2}",
5614 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5615 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5618 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5619 "vptest\t{$src2, $src1|$src1, $src2}",
5620 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5621 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5622 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5623 "vptest\t{$src2, $src1|$src1, $src2}",
5624 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5625 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5626 VEX, VEX_L, VEX_WIG;
// Legacy SSE4.1 PTEST (memory form requires alignment: memopv2i64).
5629 let Defs = [EFLAGS] in {
5630 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5631 "ptest\t{$src2, $src1|$src1, $src2}",
5632 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5633 Sched<[SchedWriteVecTest.XMM]>;
5634 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5635 "ptest\t{$src2, $src1|$src1, $src2}",
5636 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5637 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5640 // The bit test instructions below are AVX only
5640 // The bit test instructions below are AVX only
// avx_bittest - VTESTPS/VTESTPD: EFLAGS-setting FP sign-bit tests via the
// X86testp node; rr and load-folded rm forms, both VEX-encoded.
5641 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5642 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5643 X86FoldableSchedWrite sched> {
5644 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5645 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5646 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5647 Sched<[sched]>, VEX;
5648 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5649 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5650 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5651 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
// 128- and 256-bit instantiations in both FP domains.
5654 let Defs = [EFLAGS], Predicates = [HasAVX] in {
5655 let ExeDomain = SSEPackedSingle in {
5656 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5657 SchedWriteFTest.XMM>;
5658 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5659 SchedWriteFTest.YMM>, VEX_L;
5661 let ExeDomain = SSEPackedDouble in {
5662 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5663 SchedWriteFTest.XMM>;
5664 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5665 SchedWriteFTest.YMM>, VEX_L;
5669 //===----------------------------------------------------------------------===//
5670 // SSE4.1 - Misc Instructions
5671 //===----------------------------------------------------------------------===//
// POPCNT: population count over 16/32/64-bit GPRs (ctpop), rr and rm forms.
// All variants also define EFLAGS (implicit) and require HasPOPCNT.
5673 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5674 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5675 "popcnt{w}\t{$src, $dst|$dst, $src}",
5676 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5677 Sched<[WritePOPCNT]>, OpSize16, XS;
5678 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5679 "popcnt{w}\t{$src, $dst|$dst, $src}",
5680 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5681 (implicit EFLAGS)]>,
5682 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5684 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5685 "popcnt{l}\t{$src, $dst|$dst, $src}",
5686 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5687 Sched<[WritePOPCNT]>, OpSize32, XS;
5689 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5690 "popcnt{l}\t{$src, $dst|$dst, $src}",
5691 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5692 (implicit EFLAGS)]>,
5693 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5695 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5696 "popcnt{q}\t{$src, $dst|$dst, $src}",
5697 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5698 Sched<[WritePOPCNT]>, XS;
5699 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5700 "popcnt{q}\t{$src, $dst|$dst, $src}",
5701 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5702 (implicit EFLAGS)]>,
5703 Sched<[WritePOPCNT.Folded]>, XS;
5706 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Used for PHMINPOSUW (v8i16 -> v8i16); rr and load-folded rm forms.
5707 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5708 SDNode OpNode, PatFrag ld_frag,
5709 X86FoldableSchedWrite Sched> {
5710 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5712 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5713 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5715 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5717 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5719 (v8i16 (OpNode (ld_frag addr:$src))))]>,
5720 Sched<[Sched.Folded]>;
5723 // PHMIN has the same profile as PSAD, thus we use the same scheduling
5724 // model, although the naming is misleading.
5725 let Predicates = [HasAVX] in
5726 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5728 WritePHMINPOS>, VEX, VEX_WIG;
5729 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5733 /// SS48I_binop_rm - Simple SSE41 binary operator.
// Generic two-source vector op (OpNode over OpVT); the rr form is marked
// commutable, the rm form folds a memop_frag load of the second source.
5734 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5735 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5736 X86MemOperand x86memop, X86FoldableSchedWrite sched,
5738 let isCommutable = 1 in
5739 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5740 (ins RC:$src1, RC:$src2),
5742 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5743 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5744 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5746 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5747 (ins RC:$src1, x86memop:$src2),
5749 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5750 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5752 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5753 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations of SS48I_binop_rm: integer min/max, PMULDQ, PMULLD and
// PCMPEQQ. AVX 128-bit forms are gated on NoVLX / NoVLX_Or_NoBWI so the
// EVEX versions win when AVX-512VL/BW is available; legacy SSE forms use
// aligned memops and tied operands.
5756 let Predicates = [HasAVX, NoVLX] in {
5757 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5758 load, i128mem, SchedWriteVecALU.XMM, 0>,
5760 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5761 load, i128mem, SchedWriteVecALU.XMM, 0>,
5763 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5764 load, i128mem, SchedWriteVecALU.XMM, 0>,
5766 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5767 load, i128mem, SchedWriteVecALU.XMM, 0>,
5769 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5770 load, i128mem, SchedWriteVecIMul.XMM, 0>,
5773 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5774 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5775 load, i128mem, SchedWriteVecALU.XMM, 0>,
5777 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5778 load, i128mem, SchedWriteVecALU.XMM, 0>,
5780 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5781 load, i128mem, SchedWriteVecALU.XMM, 0>,
5783 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5784 load, i128mem, SchedWriteVecALU.XMM, 0>,
// AVX2 256-bit (YMM) variants.
5788 let Predicates = [HasAVX2, NoVLX] in {
5789 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5790 load, i256mem, SchedWriteVecALU.YMM, 0>,
5791 VEX_4V, VEX_L, VEX_WIG;
5792 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5793 load, i256mem, SchedWriteVecALU.YMM, 0>,
5794 VEX_4V, VEX_L, VEX_WIG;
5795 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5796 load, i256mem, SchedWriteVecALU.YMM, 0>,
5797 VEX_4V, VEX_L, VEX_WIG;
5798 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5799 load, i256mem, SchedWriteVecALU.YMM, 0>,
5800 VEX_4V, VEX_L, VEX_WIG;
5801 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5802 load, i256mem, SchedWriteVecIMul.YMM, 0>,
5803 VEX_4V, VEX_L, VEX_WIG;
5805 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5806 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5807 load, i256mem, SchedWriteVecALU.YMM, 0>,
5808 VEX_4V, VEX_L, VEX_WIG;
5809 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5810 load, i256mem, SchedWriteVecALU.YMM, 0>,
5811 VEX_4V, VEX_L, VEX_WIG;
5812 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5813 load, i256mem, SchedWriteVecALU.YMM, 0>,
5814 VEX_4V, VEX_L, VEX_WIG;
5815 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5816 load, i256mem, SchedWriteVecALU.YMM, 0>,
5817 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 forms: destructive ($src1 = $dst), aligned memop loads.
5820 let Constraints = "$src1 = $dst" in {
5821 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5822 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5823 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5824 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5825 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5826 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5827 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5828 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5829 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5830 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5831 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5832 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5833 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5834 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5835 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5836 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5837 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5838 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
// PMULLD / PCMPEQQ instantiations (note PMULLD uses its own sched class).
5841 let Predicates = [HasAVX, NoVLX] in
5842 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5843 load, i128mem, SchedWritePMULLD.XMM, 0>,
5845 let Predicates = [HasAVX] in
5846 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5847 load, i128mem, SchedWriteVecALU.XMM, 0>,
5850 let Predicates = [HasAVX2, NoVLX] in
5851 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5852 load, i256mem, SchedWritePMULLD.YMM, 0>,
5853 VEX_4V, VEX_L, VEX_WIG;
5854 let Predicates = [HasAVX2] in
5855 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5856 load, i256mem, SchedWriteVecALU.YMM, 0>,
5857 VEX_4V, VEX_L, VEX_WIG;
5859 let Constraints = "$src1 = $dst" in {
5860 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5861 memop, i128mem, SchedWritePMULLD.XMM, 1>;
5862 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5863 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5866 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// Intrinsic-backed variant: the pattern calls a named Intrinsic (IntId)
// rather than an SDNode. rri = reg/reg, rmi = reg/mem with folded load.
5867 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5868 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5869 X86MemOperand x86memop, bit Is2Addr,
5870 X86FoldableSchedWrite sched> {
5871 let isCommutable = 1 in
5872 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5873 (ins RC:$src1, RC:$src2, u8imm:$src3),
5875 !strconcat(OpcodeStr,
5876 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5877 !strconcat(OpcodeStr,
5878 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5879 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5881 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5882 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5884 !strconcat(OpcodeStr,
5885 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5886 !strconcat(OpcodeStr,
5887 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5889 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5890 Sched<[sched.Folded, sched.ReadAfterFold]>;
5893 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
// SDNode-backed sibling of SS41I_binop_rmi_int: same rri/rmi shapes but the
// pattern matches OpNode with an explicit OpVT result type.
5894 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5895 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5896 X86MemOperand x86memop, bit Is2Addr,
5897 X86FoldableSchedWrite sched> {
5898 let isCommutable = 1 in
5899 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5900 (ins RC:$src1, RC:$src2, u8imm:$src3),
5902 !strconcat(OpcodeStr,
5903 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5904 !strconcat(OpcodeStr,
5905 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5906 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5908 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5909 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5911 !strconcat(OpcodeStr,
5912 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5913 !strconcat(OpcodeStr,
5914 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5916 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5917 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Immediate transforms used when commuting / rescaling blend masks:
// BlendCommuteImmN   - invert the low N mask bits (operand swap).
// BlendScaleImmN     - widen an N-lane mask to the 8-bit pblendw mask.
// BlendScale*Commute - widen and invert in one step.
// NOTE(review): loop bodies are partially missing from this excerpt (the
// "if (Imm & (1 << i))" lines and NewImm declarations were dropped).
5920 def BlendCommuteImm2 : SDNodeXForm<timm, [{
5921 uint8_t Imm = N->getZExtValue() & 0x03;
5922 return getI8Imm(Imm ^ 0x03, SDLoc(N));
5925 def BlendCommuteImm4 : SDNodeXForm<timm, [{
5926 uint8_t Imm = N->getZExtValue() & 0x0f;
5927 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5930 def BlendCommuteImm8 : SDNodeXForm<timm, [{
5931 uint8_t Imm = N->getZExtValue() & 0xff;
5932 return getI8Imm(Imm ^ 0xff, SDLoc(N));
5935 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
5936 def BlendScaleImm4 : SDNodeXForm<timm, [{
5937 uint8_t Imm = N->getZExtValue();
5939 for (unsigned i = 0; i != 4; ++i) {
5941 NewImm |= 0x3 << (i * 2);
5943 return getI8Imm(NewImm, SDLoc(N));
5946 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
5947 def BlendScaleImm2 : SDNodeXForm<timm, [{
5948 uint8_t Imm = N->getZExtValue();
5950 for (unsigned i = 0; i != 2; ++i) {
5952 NewImm |= 0xf << (i * 4);
5954 return getI8Imm(NewImm, SDLoc(N));
5957 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
5958 def BlendScaleImm2to4 : SDNodeXForm<timm, [{
5959 uint8_t Imm = N->getZExtValue();
5961 for (unsigned i = 0; i != 2; ++i) {
5963 NewImm |= 0x3 << (i * 2);
5965 return getI8Imm(NewImm, SDLoc(N));
5968 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
5969 def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
5970 uint8_t Imm = N->getZExtValue();
5972 for (unsigned i = 0; i != 4; ++i) {
5974 NewImm |= 0x3 << (i * 2);
5976 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5979 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
5980 def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
5981 uint8_t Imm = N->getZExtValue();
5983 for (unsigned i = 0; i != 2; ++i) {
5985 NewImm |= 0xf << (i * 4);
5987 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5990 // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
5991 def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
5992 uint8_t Imm = N->getZExtValue();
5994 for (unsigned i = 0; i != 2; ++i) {
5996 NewImm |= 0x3 << (i * 2);
5998 return getI8Imm(NewImm ^ 0xf, SDLoc(N));
// MPSADBW / DPPS / DPPD instantiations of SS41I_binop_rmi_int.
// MPSADBW is explicitly non-commutable (isCommutable = 0); the dot-product
// ops read MXCSR and may raise FP exceptions.
6001 let Predicates = [HasAVX] in {
6002 let isCommutable = 0 in {
6003 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6004 VR128, load, i128mem, 0,
6005 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6008 let Uses = [MXCSR], mayRaiseFPException = 1 in {
6009 let ExeDomain = SSEPackedSingle in
6010 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6011 VR128, load, f128mem, 0,
6012 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6013 let ExeDomain = SSEPackedDouble in
6014 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6015 VR128, load, f128mem, 0,
6016 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
6017 let ExeDomain = SSEPackedSingle in
6018 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6019 VR256, load, i256mem, 0,
6020 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6024 let Predicates = [HasAVX2] in {
6025 let isCommutable = 0 in {
6026 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6027 VR256, load, i256mem, 0,
6028 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
// Legacy tied-operand SSE4.1 forms (no 256-bit DPPD exists).
6032 let Constraints = "$src1 = $dst" in {
6033 let isCommutable = 0 in {
6034 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6035 VR128, memop, i128mem, 1,
6036 SchedWriteMPSAD.XMM>;
6039 let ExeDomain = SSEPackedSingle in
6040 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6041 VR128, memop, f128mem, 1,
6042 SchedWriteDPPS.XMM>, SIMD_EXC;
6043 let ExeDomain = SSEPackedDouble in
6044 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6045 VR128, memop, f128mem, 1,
6046 SchedWriteDPPD.XMM>, SIMD_EXC;
6049 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
// Emits rri/rmi blend forms plus a commuting Pat<> for when the load ends up
// in the first source: the operands are swapped and the mask is rewritten
// via commuteXForm (one of the BlendCommuteImm* transforms above).
6050 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6051 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6052 X86MemOperand x86memop, bit Is2Addr, Domain d,
6053 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6054 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6055 let isCommutable = 1 in
6056 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6057 (ins RC:$src1, RC:$src2, u8imm:$src3),
6059 !strconcat(OpcodeStr,
6060 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6061 !strconcat(OpcodeStr,
6062 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6063 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6065 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6066 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6068 !strconcat(OpcodeStr,
6069 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6070 !strconcat(OpcodeStr,
6071 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6073 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6074 Sched<[sched.Folded, sched.ReadAfterFold]>;
6077 // Pattern to commute if load is in first source.
6078 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6079 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6080 (commuteXForm timm:$src3))>;
// BLENDPS/BLENDPD/PBLENDW instantiations and the AVX1-only integer-blend
// emulation patterns (integer blends routed through FP blends / pblendw,
// with the mask rescaled/commuted by the SDNodeXForms above).
6083 let Predicates = [HasAVX] in {
6084 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6085 VR128, load, f128mem, 0, SSEPackedSingle,
6086 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6088 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6089 VR256, load, f256mem, 0, SSEPackedSingle,
6090 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6091 VEX_4V, VEX_L, VEX_WIG;
6092 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6093 VR128, load, f128mem, 0, SSEPackedDouble,
6094 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6096 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6097 VR256, load, f256mem, 0, SSEPackedDouble,
6098 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6099 VEX_4V, VEX_L, VEX_WIG;
6100 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6101 VR128, load, i128mem, 0, SSEPackedInt,
6102 SchedWriteBlend.XMM, BlendCommuteImm8>,
6106 let Predicates = [HasAVX2] in {
6107 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6108 VR256, load, i256mem, 0, SSEPackedInt,
6109 SchedWriteBlend.YMM, BlendCommuteImm8>,
6110 VEX_4V, VEX_L, VEX_WIG;
6113 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6114 // ExecutionDomainFixPass will cleanup domains later on.
6115 let Predicates = [HasAVX1Only] in {
6116 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6117 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6118 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6119 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6120 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6121 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6123 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6124 // it from becoming movsd via commuting under optsize.
6125 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6126 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6127 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6128 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6129 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6130 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6132 def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6133 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6134 def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6135 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6136 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6137 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6139 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6140 // it from becoming movss via commuting under optsize.
6141 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6142 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6143 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6144 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6145 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6146 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
// Legacy SSE4.1 two-address blend-with-immediate forms (Is2Addr = 1):
// same opcodes as the VEX variants above but $src1 is tied to $dst.
6149 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6150 VR128, memop, f128mem, 1, SSEPackedSingle,
6151 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6152 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6153 VR128, memop, f128mem, 1, SSEPackedDouble,
6154 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6155 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6156 VR128, memop, i128mem, 1, SSEPackedInt,
6157 SchedWriteBlend.XMM, BlendCommuteImm8>;
// SSE4.1 (non-AVX) counterparts of the pblendw emulation patterns above:
// select v2i64/v4i32 blends as PBLENDW with a scaled per-word immediate.
6159 let Predicates = [UseSSE41] in {
6160 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6161 // it from becoming movss via commuting under optsize.
6162 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6163 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6164 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6165 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6166 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6167 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6169 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6170 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6171 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6172 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6173 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6174 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6177 // For insertion into the zero index (low half) of a 256-bit vector, it is
6178 // more efficient to generate a blend with immediate instead of an insert*128.
// insert_subvector into lane 0 of a YMM == blend keeping the upper lane:
// immediate selects which elements come from the (widened) XMM operand.
6179 let Predicates = [HasAVX] in {
6180 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6181 (VBLENDPDYrri VR256:$src1,
6182 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6183 VR128:$src2, sub_xmm), 0x3)>;
6184 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6185 (VBLENDPSYrri VR256:$src1,
6186 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6187 VR128:$src2, sub_xmm), 0xf)>;
// When the full-width operand is a load, fold it into the rmi form and
// invert the immediate so the memory operand supplies the upper elements.
6189 def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6190 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6191 VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6192 def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6193 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6194 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6197 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
// Four-operand AVX variable blend (vblendv*): dst, src1, src2, and the mask
// register src3 encoded in an immediate byte (Ii8Reg). Note the selection
// pattern reverses operand order: the X86Blendv node takes the mask first,
// so it is matched as (OpNode $src3, $src2, $src1).
6198 multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6199 X86MemOperand x86memop, ValueType VT,
6200 PatFrag mem_frag, SDNode OpNode,
6201 X86FoldableSchedWrite sched> {
6202 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6203 (ins RC:$src1, RC:$src2, RC:$src3),
6204 !strconcat(OpcodeStr,
6205 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6206 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6207 SSEPackedInt>, TAPD, VEX_4V,
// Memory form folds a load of $src2; the explicit ReadDefault entries cover
// the address-operand micro-ops in the scheduling read list.
6210 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6211 (ins RC:$src1, x86memop:$src2, RC:$src3),
6212 !strconcat(OpcodeStr,
6213 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6215 (OpNode RC:$src3, (mem_frag addr:$src2),
6216 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6217 Sched<[sched.Folded, sched.ReadAfterFold,
6219 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6222 sched.ReadAfterFold]>;
// AVX/AVX2 variable blends. ExeDomain pins vblendvpd/vblendvps to the
// proper FP domain; vpblendvb stays in the integer domain.
6225 let Predicates = [HasAVX] in {
6226 let ExeDomain = SSEPackedDouble in {
6227 defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6228 v2f64, loadv2f64, X86Blendv,
6229 SchedWriteFVarBlend.XMM>;
6230 defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6231 v4f64, loadv4f64, X86Blendv,
6232 SchedWriteFVarBlend.YMM>, VEX_L;
6233 } // ExeDomain = SSEPackedDouble
6234 let ExeDomain = SSEPackedSingle in {
6235 defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6236 v4f32, loadv4f32, X86Blendv,
6237 SchedWriteFVarBlend.XMM>;
6238 defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6239 v8f32, loadv8f32, X86Blendv,
6240 SchedWriteFVarBlend.YMM>, VEX_L;
6241 } // ExeDomain = SSEPackedSingle
6242 defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6243 v16i8, loadv16i8, X86Blendv,
6244 SchedWriteVarBlend.XMM>;
// The 256-bit byte blend requires AVX2.
6247 let Predicates = [HasAVX2] in {
6248 defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6249 v32i8, loadv32i8, X86Blendv,
6250 SchedWriteVarBlend.YMM>, VEX_L;
// Select i32/i64 element variable blends with the FP blendv instructions
// (no dedicated integer dword/qword blendv exists); note the operand swap:
// the mask node operand becomes the instruction's third source.
6253 let Predicates = [HasAVX] in {
6254 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6255 (v4i32 VR128:$src2))),
6256 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6257 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6258 (v2i64 VR128:$src2))),
6259 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6260 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6261 (v8i32 VR256:$src2))),
6262 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6263 def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6264 (v4i64 VR256:$src2))),
6265 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6268 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6269 // changed to use blends because blends have better throughput on sandybridge
6270 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// When optimizing for speed, lower vzmovl/movss/movsd as immediate blends
// (better throughput); the immediate picks element 0 from the second source.
6271 let Predicates = [HasAVX, OptForSpeed] in {
6272 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6273 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6274 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6275 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6277 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6278 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6279 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6280 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
// Commuted load: immediate 0xe keeps elements 1-3 from memory instead.
6281 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6282 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6284 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6285 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6286 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6287 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6288 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6289 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6291 // Move low f32 and clear high bits.
6292 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6293 (SUBREG_TO_REG (i32 0),
6294 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6295 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6296 (i8 1))), sub_xmm)>;
6297 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6298 (SUBREG_TO_REG (i32 0),
6299 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6300 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6301 (i8 3))), sub_xmm)>;
6304 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6305 // changed to use blends because blends have better throughput on sandybridge
6306 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// Non-AVX counterpart of the OptForSpeed patterns above, using the
// two-address SSE4.1 blend forms.
6307 let Predicates = [UseSSE41, OptForSpeed] in {
6308 // With SSE41 we can use blends for these patterns.
6309 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6310 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6311 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6312 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6314 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6315 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6316 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6317 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6318 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6319 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6321 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6322 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6323 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6324 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6325 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6326 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6330 /// SS41I_ternary - SSE 4.1 ternary operator
// SSE4.1 (non-AVX) variable blend: the mask is the implicit XMM0 register
// (Uses = [XMM0]) and $src1 is tied to $dst. The pattern takes the mask
// node operand from XMM0 rather than an explicit source.
6331 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6332 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6333 PatFrag mem_frag, X86MemOperand x86memop,
6334 SDNode OpNode, X86FoldableSchedWrite sched> {
6335 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6336 (ins VR128:$src1, VR128:$src2),
6337 !strconcat(OpcodeStr,
6338 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6340 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
// Memory form folds a load of $src2.
6343 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6344 (ins VR128:$src1, x86memop:$src2),
6345 !strconcat(OpcodeStr,
6346 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6348 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6349 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Legacy SSE4.1 variable blends using implicit XMM0 as the mask.
6353 let ExeDomain = SSEPackedDouble in
6354 defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6355 X86Blendv, SchedWriteFVarBlend.XMM>;
6356 let ExeDomain = SSEPackedSingle in
6357 defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6358 X86Blendv, SchedWriteFVarBlend.XMM>;
6359 defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6360 X86Blendv, SchedWriteVarBlend.XMM>;
6362 // Aliases with the implicit xmm0 argument
// Assembler aliases that omit the implicit xmm0 mask operand
// (priority 0, so the disassembler still prints the explicit form).
6363 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6364 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6365 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6366 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6367 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6368 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6369 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6370 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6371 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6372 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6373 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6374 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
// Integer dword/qword variable blends have no dedicated SSE instruction;
// select them as the FP blendv forms (mask comes from implicit XMM0).
6376 let Predicates = [UseSSE41] in {
6377 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6378 (v4i32 VR128:$src2))),
6379 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6380 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6381 (v2i64 VR128:$src2))),
6382 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
// Non-temporal aligned loads (movntdqa). The instructions carry no
// patterns themselves; the Pat records below map every 128/256-bit vector
// type onto them, with AddedComplexity = 400 to win over normal loads.
6385 let AddedComplexity = 400 in { // Prefer non-temporal versions
6387 let Predicates = [HasAVX, NoVLX] in
6388 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6389 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6390 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6391 let Predicates = [HasAVX2, NoVLX] in
6392 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6393 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6394 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6395 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6396 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6397 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
// 256-bit aligned non-temporal loads of any element type -> vmovntdqa ymm.
6399 let Predicates = [HasAVX2, NoVLX] in {
6400 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6401 (VMOVNTDQAYrm addr:$src)>;
6402 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6403 (VMOVNTDQAYrm addr:$src)>;
6404 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6405 (VMOVNTDQAYrm addr:$src)>;
6406 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6407 (VMOVNTDQAYrm addr:$src)>;
6408 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6409 (VMOVNTDQAYrm addr:$src)>;
6410 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6411 (VMOVNTDQAYrm addr:$src)>;
// 128-bit AVX forms.
6414 let Predicates = [HasAVX, NoVLX] in {
6415 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6416 (VMOVNTDQArm addr:$src)>;
6417 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6418 (VMOVNTDQArm addr:$src)>;
6419 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6420 (VMOVNTDQArm addr:$src)>;
6421 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6422 (VMOVNTDQArm addr:$src)>;
6423 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6424 (VMOVNTDQArm addr:$src)>;
6425 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6426 (VMOVNTDQArm addr:$src)>;
// Legacy SSE4.1 form.
6429 let Predicates = [UseSSE41] in {
6430 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6431 (MOVNTDQArm addr:$src)>;
6432 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6433 (MOVNTDQArm addr:$src)>;
6434 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6435 (MOVNTDQArm addr:$src)>;
6436 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6437 (MOVNTDQArm addr:$src)>;
6438 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6439 (MOVNTDQArm addr:$src)>;
6440 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6441 (MOVNTDQArm addr:$src)>;
6444 } // AddedComplexity
6446 //===----------------------------------------------------------------------===//
6447 // SSE4.2 - Compare Instructions
6448 //===----------------------------------------------------------------------===//
6450 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
// Emits rr/rm forms; the !if-selected asm string handles both two-address
// SSE syntax and three-operand VEX syntax depending on Is2Addr.
6451 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6452 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6453 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6455 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6456 (ins RC:$src1, RC:$src2),
6458 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6459 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6460 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
// Memory form folds a load of $src2.
6462 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6463 (ins RC:$src1, x86memop:$src2),
6465 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6466 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6468 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6469 Sched<[sched.Folded, sched.ReadAfterFold]>;
// SSE4.2 packed compare greater-than on quadwords (pcmpgtq) in VEX 128/256
// and legacy two-address encodings.
6472 let Predicates = [HasAVX] in
6473 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6474 load, i128mem, SchedWriteVecALU.XMM, 0>,
6477 let Predicates = [HasAVX2] in
6478 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6479 load, i256mem, SchedWriteVecALU.YMM, 0>,
6480 VEX_4V, VEX_L, VEX_WIG;
6482 let Constraints = "$src1 = $dst" in
6483 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6484 memop, i128mem, SchedWriteVecALU.XMM>;
6486 //===----------------------------------------------------------------------===//
6487 // SSE4.2 - String/text Processing Instructions
6488 //===----------------------------------------------------------------------===//
// SSE4.2 string/text instructions (pcmpistrm/pcmpestrm/pcmpistri/pcmpestri).
// All are defined with empty patterns and hasSideEffects = 0; their results
// go to implicit registers (XMM0 or ECX, plus EFLAGS). The "e" (explicit
// length) variants additionally read lengths from implicit EAX/EDX.
6490 multiclass pcmpistrm_SS42AI<string asm> {
6491 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6492 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6493 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6494 []>, Sched<[WritePCmpIStrM]>;
6496 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6497 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6498 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6499 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
// pcmpistrm: implicit-length compare, mask result in XMM0.
6502 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6503 let Predicates = [HasAVX] in
6504 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6505 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
6508 multiclass SS42AI_pcmpestrm<string asm> {
6509 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6510 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6511 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6512 []>, Sched<[WritePCmpEStrM]>;
6514 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6515 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6516 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6517 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
// pcmpestrm: explicit-length compare (lengths in EAX/EDX), mask in XMM0.
6520 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6521 let Predicates = [HasAVX] in
6522 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6523 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6526 multiclass SS42AI_pcmpistri<string asm> {
6527 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6528 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6529 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6530 []>, Sched<[WritePCmpIStrI]>;
6532 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6533 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6534 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6535 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
// pcmpistri: implicit-length compare, index result in ECX.
6538 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6539 let Predicates = [HasAVX] in
6540 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6541 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6544 multiclass SS42AI_pcmpestri<string asm> {
6545 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6546 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6547 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6548 []>, Sched<[WritePCmpEStrI]>;
6550 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6551 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6552 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6553 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
// pcmpestri: explicit-length compare (lengths in EAX/EDX), index in ECX.
6556 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6557 let Predicates = [HasAVX] in
6558 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6559 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6562 //===----------------------------------------------------------------------===//
6563 // SSE4.2 - CRC Instructions
6564 //===----------------------------------------------------------------------===//
6566 // No CRC instructions have AVX equivalents
6568 // crc intrinsic instruction
6569 // This set of instructions are only rm, the only difference is the size
// crc32 register-register form: accumulates into RCOut:$dst (tied to $src1
// by the enclosing Constraints below) via the given intrinsic.
6571 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6572 RegisterClass RCIn, SDPatternOperator Int> :
6573 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6574 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6575 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6576 Sched<[WriteCRC32]>;
// crc32 register-memory form: same, with the data operand loaded.
6578 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6579 X86MemOperand x86memop, SDPatternOperator Int> :
6580 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6581 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6582 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6583 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
// Concrete crc32 encodings: opcode 0xF0 for byte sources, 0xF1 for
// word/dword/qword (distinguished by operand-size/REX.W prefixes).
6585 let Constraints = "$src1 = $dst" in {
6586 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6587 int_x86_sse42_crc32_32_8>;
6588 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6589 int_x86_sse42_crc32_32_8>;
6590 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6591 int_x86_sse42_crc32_32_16>, OpSize16;
6592 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6593 int_x86_sse42_crc32_32_16>, OpSize16;
6594 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6595 int_x86_sse42_crc32_32_32>, OpSize32;
6596 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6597 int_x86_sse42_crc32_32_32>, OpSize32;
6598 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6599 int_x86_sse42_crc32_64_64>, REX_W;
6600 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6601 int_x86_sse42_crc32_64_64>, REX_W;
// 64-bit-destination byte forms, defined without patterns.
6602 let hasSideEffects = 0 in {
6604 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6606 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6611 //===----------------------------------------------------------------------===//
6612 // SHA-NI Instructions
6613 //===----------------------------------------------------------------------===//
6615 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHA-NI binary ops. When UsesXMM0 is set (sha256rnds2), the instruction
// takes an extra implicit XMM0 operand in both asm string and pattern.
6616 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6617 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6618 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6619 (ins VR128:$src1, VR128:$src2),
6621 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6622 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6624 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6625 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6626 T8PS, Sched<[sched]>;
6628 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6629 (ins VR128:$src1, i128mem:$src2),
6631 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6632 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6634 (set VR128:$dst, (IntId VR128:$src1,
6635 (memop addr:$src2), XMM0)),
6636 (set VR128:$dst, (IntId VR128:$src1,
6637 (memop addr:$src2))))]>, T8PS,
6638 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sha1rnds4 takes an extra 2-bit round-function immediate, so it is
// defined directly rather than through SHAI_binop.
6641 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6642 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6643 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6644 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6646 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6647 (i8 timm:$src3)))]>, TAPS,
6648 Sched<[SchedWriteVecIMul.XMM]>;
6649 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6650 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6651 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6653 (int_x86_sha1rnds4 VR128:$src1,
6655 (i8 timm:$src3)))]>, TAPS,
6656 Sched<[SchedWriteVecIMul.XMM.Folded,
6657 SchedWriteVecIMul.XMM.ReadAfterFold]>;
6659 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6660 SchedWriteVecIMul.XMM>;
6661 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6662 SchedWriteVecIMul.XMM>;
6663 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6664 SchedWriteVecIMul.XMM>;
// sha256rnds2 uses implicit XMM0 for the round key (UsesXMM0 = 1).
6667 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6668 SchedWriteVecIMul.XMM, 1>;
6670 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6671 SchedWriteVecIMul.XMM>;
6672 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6673 SchedWriteVecIMul.XMM>;
6676 // Aliases with explicit %xmm0
6677 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6678 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6679 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6680 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6682 //===----------------------------------------------------------------------===//
6683 // AES-NI Instructions
6684 //===----------------------------------------------------------------------===//
// AES round ops (aesenc/aesenclast/aesdec/aesdeclast) as intrinsic-based
// binops; AsmString is set once via let so both rr/rm share the
// Is2Addr-selected operand syntax.
6686 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6687 Intrinsic IntId, PatFrag ld_frag,
6688 bit Is2Addr = 0, RegisterClass RC = VR128,
6689 X86MemOperand MemOp = i128mem> {
6690 let AsmString = OpcodeStr#
6691 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6692 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6693 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6694 (ins RC:$src1, RC:$src2), "",
6695 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6696 Sched<[WriteAESDecEnc]>;
6697 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6698 (ins RC:$src1, MemOp:$src2), "",
6699 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6700 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6704 // Perform One Round of an AES Encryption/Decryption Flow
6705 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6706 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
6707 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6708 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
6709 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6710 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
6711 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6712 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
6713 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
// 256-bit forms require the VAES feature.
6716 let Predicates = [NoVLX, HasVAES] in {
6717 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
6718 int_x86_aesni_aesenc_256, load, 0, VR256,
6719 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6720 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
6721 int_x86_aesni_aesenclast_256, load, 0, VR256,
6722 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6723 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
6724 int_x86_aesni_aesdec_256, load, 0, VR256,
6725 i256mem>, VEX_4V, VEX_L, VEX_WIG;
6726 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
6727 int_x86_aesni_aesdeclast_256, load, 0, VR256,
6728 i256mem>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE two-address AES round ops (Is2Addr = 1, $src1 tied to $dst).
6731 let Constraints = "$src1 = $dst" in {
6732 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
6733 int_x86_aesni_aesenc, memop, 1>;
6734 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
6735 int_x86_aesni_aesenclast, memop, 1>;
6736 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
6737 int_x86_aesni_aesdec, memop, 1>;
6738 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
6739 int_x86_aesni_aesdeclast, memop, 1>;
6742 // Perform the AES InvMixColumn Transformation
6743 let Predicates = [HasAVX, HasAES] in {
6744 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6746 "vaesimc\t{$src1, $dst|$dst, $src1}",
6748 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6750 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6751 (ins i128mem:$src1),
6752 "vaesimc\t{$src1, $dst|$dst, $src1}",
6753 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6754 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
// Legacy SSE aesimc (unary, so no tied-operand constraint needed).
6756 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6758 "aesimc\t{$src1, $dst|$dst, $src1}",
6760 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6761 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6762 (ins i128mem:$src1),
6763 "aesimc\t{$src1, $dst|$dst, $src1}",
6764 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6765 Sched<[WriteAESIMC.Folded]>;
6767 // AES Round Key Generation Assist
6768 let Predicates = [HasAVX, HasAES] in {
6769 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6770 (ins VR128:$src1, u8imm:$src2),
6771 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6773 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6774 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6775 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6776 (ins i128mem:$src1, u8imm:$src2),
6777 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6779 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6780 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
// Legacy SSE aeskeygenassist.
6782 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6783 (ins VR128:$src1, u8imm:$src2),
6784 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6786 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6787 Sched<[WriteAESKeyGen]>;
6788 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6789 (ins i128mem:$src1, u8imm:$src2),
6790 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6792 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6793 Sched<[WriteAESKeyGen.Folded]>;
6795 //===----------------------------------------------------------------------===//
6796 // PCLMUL Instructions
6797 //===----------------------------------------------------------------------===//
6799 // Immediate transform to help with commuting.
// Swaps the two 4-bit halves of the pclmulqdq immediate, so the source
// operands can be commuted (each nibble selects a quadword per source).
6800 def PCLMULCommuteImm : SDNodeXForm<timm, [{
6801 uint8_t Imm = N->getZExtValue();
6802 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6805 // SSE carry-less Multiplication instructions
6806 let Predicates = [NoAVX, HasPCLMUL] in {
6807 let Constraints = "$src1 = $dst" in {
6808 let isCommutable = 1 in
6809 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6810 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6811 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6813 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6814 Sched<[WriteCLMul]>;
6816 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6817 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6818 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6820 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6822 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6823 } // Constraints = "$src1 = $dst"
// Commute a load in the first operand by swapping the nibbles of the imm.
6825 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6827 (PCLMULQDQrm VR128:$src1, addr:$src2,
6828 (PCLMULCommuteImm timm:$src3))>;
6829 } // Predicates = [NoAVX, HasPCLMUL]
// Asm aliases pclmul{hq,lq}{hq,lq}dq: the hq/lq suffixes encode the
// immediate nibbles (hq selects the high quadword of the operand).
6832 foreach HI = ["hq","lq"] in
6833 foreach LO = ["hq","lq"] in {
6834 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6835 (PCLMULQDQrr VR128:$dst, VR128:$src,
6836 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6837 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6838 (PCLMULQDQrm VR128:$dst, i128mem:$src,
6839 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6842 // AVX carry-less Multiplication instructions
// AVX three-operand carry-less multiply; parameterized over 128/256-bit
// register classes and intrinsics.
6843 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6844 PatFrag LdFrag, Intrinsic IntId> {
6845 let isCommutable = 1 in
6846 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6847 (ins RC:$src1, RC:$src2, u8imm:$src3),
6848 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6850 (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6851 Sched<[WriteCLMul]>;
6853 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6854 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6855 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6857 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6858 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6860 // We can commute a load in the first operand by swapping the sources and
6861 // rotating the immediate.
6862 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6863 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6864 (PCLMULCommuteImm timm:$src3))>;
// 128-bit VEX form; 256-bit form requires the VPCLMULQDQ feature.
6867 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6868 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6869 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6871 let Predicates = [NoVLX, HasVPCLMULQDQ] in
6872 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6873 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
// Asm aliases vpclmul{hq,lq}{hq,lq}dq, mirroring the SSE alias scheme:
// the hq/lq suffixes are folded into the immediate nibbles.
6875 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6876 X86MemOperand MemOp, string Hi, string Lo> {
6877 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6878 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6879 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6880 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6881 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6882 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6885 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6886 X86MemOperand MemOp> {
6887 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6888 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6889 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6890 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6894 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6895 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6897 //===----------------------------------------------------------------------===//
6898 // SSE4A Instructions
6899 //===----------------------------------------------------------------------===//
6901 let Predicates = [HasSSE4A] in {
6903 let ExeDomain = SSEPackedInt in {
6904 let Constraints = "$src = $dst" in {
// EXTRQI/EXTRQ: AMD SSE4A bit-field extract from the low 64 bits of an XMM
// register; immediate form takes explicit length/index, register form takes
// them packed in $mask. Both are read-modify-write on $src ($src = $dst).
6905 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6906 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6907 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6908 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6910 PD, Sched<[SchedWriteVecALU.XMM]>;
6911 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6912 (ins VR128:$src, VR128:$mask),
6913 "extrq\t{$mask, $src|$src, $mask}",
6914 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6916 PD, Sched<[SchedWriteVecALU.XMM]>;
// INSERTQI/INSERTQ: bit-field insert of $src2's low bits into $src; same
// immediate-vs-register split as EXTRQ, but XD-prefixed.
6918 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6919 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6920 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6921 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6922 timm:$len, timm:$idx))]>,
6923 XD, Sched<[SchedWriteVecALU.XMM]>;
6924 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
6925 (ins VR128:$src, VR128:$mask),
6926 "insertq\t{$mask, $src|$src, $mask}",
6927 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6929 XD, Sched<[SchedWriteVecALU.XMM]>;
6931 } // ExeDomain = SSEPackedInt
6933 // Non-temporal (unaligned) scalar stores.
6934 let AddedComplexity = 400 in { // Prefer non-temporal versions
// MOVNTSS/MOVNTSD have no ISel patterns here (mayStore, no side effects);
// the Pats below select them for scalar nontemporalstore by first copying
// the FR32/FR64 value into an XMM register class.
6935 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6936 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6937 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6939 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6940 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6943 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6944 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6946 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6947 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6949 } // AddedComplexity
// NOTE(review): numbering jumps 6949 -> 6952; the closing brace of the
// `let Predicates = [HasSSE4A]` scope appears to be missing here.
6952 //===----------------------------------------------------------------------===//
6954 //===----------------------------------------------------------------------===//
6956 //===----------------------------------------------------------------------===//
6957 // VBROADCAST - Load from memory and broadcast to all elements of the
6958 // destination operand
// avx_broadcast_rm: AVX1 broadcast has only a memory-source form; the
// broadcast semantics come from bcast_frag (X86VBroadcastld32/64 below).
6960 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6961 X86MemOperand x86memop, ValueType VT,
6962 PatFrag bcast_frag, SchedWrite Sched> :
6963 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6964 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6965 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6966 Sched<[Sched]>, VEX;
6968 // AVX2 adds register forms
6969 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6970 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6971 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6972 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6973 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6974 Sched<[Sched]>, VEX;
// Memory-source broadcasts (AVX1): vbroadcastss to XMM/YMM, vbroadcastsd to
// YMM only. All three reuse the XMM folded-shuffle scheduling class.
6976 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6977 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6978 f32mem, v4f32, X86VBroadcastld32,
6979 SchedWriteFShuffle.XMM.Folded>;
6980 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6981 f32mem, v8f32, X86VBroadcastld32,
6982 SchedWriteFShuffle.XMM.Folded>, VEX_L;
6984 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6985 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6986 v4f64, X86VBroadcastld64,
6987 SchedWriteFShuffle.XMM.Folded>, VEX_L;
// Register-source broadcasts (AVX2 only).
6989 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6990 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6991 v4f32, v4f32, SchedWriteFShuffle.XMM>;
6992 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6993 v8f32, v4f32, WriteFShuffle256>, VEX_L;
6995 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6996 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6997 v4f64, v2f64, WriteFShuffle256>, VEX_L;
6999 //===----------------------------------------------------------------------===//
7000 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7001 // halves of a 256-bit vector.
// No ISel patterns on the defs themselves (mayLoad, hasSideEffects = 0);
// selection happens through the X86SubVBroadcast Pats below.
7003 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7004 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7006 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7007 Sched<[WriteShuffleLd]>, VEX, VEX_L;
7009 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7010 ExeDomain = SSEPackedSingle in
7011 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7013 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7014 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
// FP subvector-broadcast patterns.
7016 let Predicates = [HasAVX, NoVLX] in {
7017 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7018 (VBROADCASTF128 addr:$src)>;
7019 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7020 (VBROADCASTF128 addr:$src)>;
7023 // NOTE: We're using FP instructions here, but execution domain fixing can
7024 // convert to integer when profitable.
7025 let Predicates = [HasAVX, NoVLX] in {
7026 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7027 (VBROADCASTF128 addr:$src)>;
7028 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7029 (VBROADCASTF128 addr:$src)>;
7030 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7031 (VBROADCASTF128 addr:$src)>;
7032 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7033 (VBROADCASTF128 addr:$src)>;
// NOTE(review): the (ins ...) operand lines of both defs (orig. 7005, 7012)
// and the closing braces of both Pat scopes are missing from this excerpt.
7036 //===----------------------------------------------------------------------===//
7037 // VINSERTF128 - Insert packed floating-point values
// Defs carry no patterns; lowering goes through vinsert_lowering below,
// which converts the insert-position index into the instruction immediate.
7039 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7040 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7041 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7042 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7043 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7045 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7046 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7047 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7048 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7051 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
7052 // with YMM register containing zero.
7053 // FIXME: Avoid producing vxorps to clear the fake inputs.
7054 let Predicates = [HasAVX1Only] in {
7055 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
// vinsert_lowering: select rr/rm VINSERT*128 for 128-into-256 vector
// inserts; INSERT_get_vinsert128_imm derives the lane immediate from $ins.
7058 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7059 PatFrag memop_frag> {
7060 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7062 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7063 (INSERT_get_vinsert128_imm VR256:$ins))>;
7064 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7065 (From (memop_frag addr:$src2)),
7067 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7068 (INSERT_get_vinsert128_imm VR256:$ins))>;
// FP types need only AVX; integer types use VINSERTF128 when AVX2's
// VINSERTI128 is unavailable (HasAVX1Only).
7071 let Predicates = [HasAVX, NoVLX] in {
7072 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7073 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7076 let Predicates = [HasAVX1Only] in {
7077 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7078 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7079 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7080 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7083 //===----------------------------------------------------------------------===//
7084 // VEXTRACTF128 - Extract packed floating-point values
// Register and store forms; patterns live in vextract_lowering below.
7086 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7087 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7088 (ins VR256:$src1, u8imm:$src2),
7089 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7090 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7092 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7093 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7094 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7095 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
// vextract_lowering: select rr for 128-bit lane extracts and mr when the
// extract feeds a store; EXTRACT_get_vextract128_imm derives the immediate.
7098 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7099 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7100 (To (!cast<Instruction>(InstrStr#rr)
7102 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7103 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7104 (iPTR imm))), addr:$dst),
7105 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7106 (EXTRACT_get_vextract128_imm VR128:$ext))>;
// FP extracts on any AVX; integer extracts via VEXTRACTF128 on AVX1-only.
7110 let Predicates = [HasAVX, NoVLX] in {
7111 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7112 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7115 let Predicates = [HasAVX1Only] in {
7116 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7117 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7118 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7119 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7122 //===----------------------------------------------------------------------===//
7123 // VMASKMOV - Conditional SIMD Packed Loads and Stores
// avx_movmask_rm: four instructions per element type — masked load and
// masked store, each in XMM (rm/mr) and YMM (Yrm/Ymr) widths. Selection is
// purely intrinsic-driven; note the intrinsics take (addr, mask) while the
// assembly operand order is (mask, addr).
7125 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7126 Intrinsic IntLd, Intrinsic IntLd256,
7127 Intrinsic IntSt, Intrinsic IntSt256,
7128 X86SchedWriteMaskMove schedX,
7129 X86SchedWriteMaskMove schedY> {
7130 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7131 (ins VR128:$src1, f128mem:$src2),
7132 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7133 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7134 VEX_4V, Sched<[schedX.RM]>;
7135 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7136 (ins VR256:$src1, f256mem:$src2),
7137 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7138 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7139 VEX_4V, VEX_L, Sched<[schedY.RM]>;
7140 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7141 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7142 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7143 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7144 VEX_4V, Sched<[schedX.MR]>;
7145 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7146 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7147 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7148 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7149 VEX_4V, VEX_L, Sched<[schedY.MR]>;
// Single- and double-precision instantiations.
7152 let ExeDomain = SSEPackedSingle in
7153 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7154 int_x86_avx_maskload_ps,
7155 int_x86_avx_maskload_ps_256,
7156 int_x86_avx_maskstore_ps,
7157 int_x86_avx_maskstore_ps_256,
7158 WriteFMaskMove32, WriteFMaskMove32Y>;
7159 let ExeDomain = SSEPackedDouble in
7160 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7161 int_x86_avx_maskload_pd,
7162 int_x86_avx_maskload_pd_256,
7163 int_x86_avx_maskstore_pd,
7164 int_x86_avx_maskstore_pd_256,
7165 WriteFMaskMove64, WriteFMaskMove64Y>;
7167 //===----------------------------------------------------------------------===//
7168 // VPERMIL - Permute Single and Double Floating-Point Values
// avx_permil: variable-control forms rr/rm (control is an integer vector,
// X86VPermilpv) and immediate forms ri/mi (X86VPermilpi). The memory
// operand of the variable form is the integer control vector; the memory
// operand of the immediate form is the FP data vector.
7171 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7172 RegisterClass RC, X86MemOperand x86memop_f,
7173 X86MemOperand x86memop_i,
7174 ValueType f_vt, ValueType i_vt,
7175 X86FoldableSchedWrite sched,
7176 X86FoldableSchedWrite varsched> {
7177 let Predicates = [HasAVX, NoVLX] in {
7178 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7179 (ins RC:$src1, RC:$src2),
7180 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7181 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7183 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7184 (ins RC:$src1, x86memop_i:$src2),
7185 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7186 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7187 (i_vt (load addr:$src2)))))]>, VEX_4V,
7188 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7190 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7191 (ins RC:$src1, u8imm:$src2),
7192 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7193 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7195 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7196 (ins x86memop_f:$src1, u8imm:$src2),
7197 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7199 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7200 Sched<[sched.Folded]>;
7201 }// Predicates = [HasAVX, NoVLX]
// XMM and YMM instantiations for PS (imm opcode 0x04) and PD (0x05).
7204 let ExeDomain = SSEPackedSingle in {
7205 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7206 v4f32, v4i32, SchedWriteFShuffle.XMM,
7207 SchedWriteFVarShuffle.XMM>;
7208 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7209 v8f32, v8i32, SchedWriteFShuffle.YMM,
7210 SchedWriteFVarShuffle.YMM>, VEX_L;
7212 let ExeDomain = SSEPackedDouble in {
7213 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7214 v2f64, v2i64, SchedWriteFShuffle.XMM,
7215 SchedWriteFVarShuffle.XMM>;
7216 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7217 v4f64, v4i64, SchedWriteFShuffle.YMM,
7218 SchedWriteFVarShuffle.YMM>, VEX_L;
7221 //===----------------------------------------------------------------------===//
7222 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7225 let ExeDomain = SSEPackedSingle in {
7226 let isCommutable = 1 in
7227 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7228 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7229 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7230 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7231 (i8 timm:$src3))))]>, VEX_4V, VEX_L,
7232 Sched<[WriteFShuffle256]>;
7233 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7234 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7235 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7236 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7237 (i8 timm:$src3)))]>, VEX_4V, VEX_L,
7238 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7241 // Immediate transform to help with commuting.
// XOR with 0x22 flips bit 1 of each 4-bit selector nibble, i.e. it swaps
// which source each selector refers to — used when the load is moved from
// operand 1 to operand 2.
7242 def Perm2XCommuteImm : SDNodeXForm<timm, [{
7243 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
// NOTE(review): numbering jumps 7243 -> 7246; the `}]>;` terminator of this
// SDNodeXForm appears to be missing from this excerpt.
7246 let Predicates = [HasAVX] in {
7247 // Pattern with load in other operand.
7248 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7249 VR256:$src1, (i8 timm:$imm))),
7250 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
// On AVX1-only targets, integer 128-bit-lane permutes also go through the
// FP instruction (no VPERM2I128 available).
7253 let Predicates = [HasAVX1Only] in {
7254 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7255 (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
7256 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7257 (loadv4i64 addr:$src2), (i8 timm:$imm))),
7258 (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
7259 // Pattern with load in other operand.
7260 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7261 VR256:$src1, (i8 timm:$imm))),
7262 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7265 //===----------------------------------------------------------------------===//
7266 // VZERO - Zero YMM registers
7267 // Note: These instruction do not affect the YMM16-YMM31.
// Both instructions list YMM0-YMM15 as Defs so the register allocator knows
// every modeled YMM register is clobbered; they share opcode 0x77 and are
// distinguished by VEX_L (set for vzeroall, clear for vzeroupper).
7270 let SchedRW = [WriteSystem] in {
7271 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7272 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7273 // Zero All YMM registers
7274 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7275 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7276 Requires<[HasAVX]>, VEX_WIG;
7278 // Zero Upper bits of YMM registers
7279 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7280 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7281 Requires<[HasAVX]>, VEX_WIG;
7285 //===----------------------------------------------------------------------===//
7286 // Half precision conversion instructions
// f16c_ph2ps: vcvtph2ps (half -> single). rr has a pattern; rm is
// pattern-less (mayLoad) and is selected by the explicit Pats below.
7289 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7290 X86FoldableSchedWrite sched> {
7291 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7292 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7293 [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7294 T8PD, VEX, Sched<[sched]>;
7295 let hasSideEffects = 0, mayLoad = 1 in
7296 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7297 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7298 []>, T8PD, VEX, Sched<[sched.Folded]>;
// f16c_ps2ph: vcvtps2ph (single -> half); $src2 is the rounding-control
// immediate. mr (store form) is pattern-less, selected by the Pats below.
7301 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7302 SchedWrite RR, SchedWrite MR> {
7303 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7304 (ins RC:$src1, i32u8imm:$src2),
7305 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7306 [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7307 TAPD, VEX, Sched<[RR]>;
7308 let hasSideEffects = 0, mayStore = 1 in
7309 def mr : Ii8<0x1D, MRMDestMem, (outs),
7310 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7311 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7312 TAPD, VEX, Sched<[MR]>;
// XMM/YMM instantiations; SIMD_EXC marks them as raising FP exceptions.
7315 let Predicates = [HasF16C, NoVLX] in {
7316 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7317 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7318 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7319 WriteCvtPS2PHSt>, SIMD_EXC;
7320 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7321 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7323 // Pattern match vcvtph2ps of a scalar i64 load.
7324 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7325 (VCVTPH2PSrm addr:$src)>;
7326 def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7327 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7328 (VCVTPH2PSrm addr:$src)>;
7329 def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7330 (VCVTPH2PSYrm addr:$src)>;
// Fold cvtps2ph + store of the low 64 bits (the 4 halves) into the
// memory-destination form.
7332 def : Pat<(store (f64 (extractelt
7333 (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7334 (iPTR 0))), addr:$dst),
7335 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7336 def : Pat<(store (i64 (extractelt
7337 (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7338 (iPTR 0))), addr:$dst),
7339 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7340 def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7341 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7344 //===----------------------------------------------------------------------===//
7345 // AVX2 Instructions
7346 //===----------------------------------------------------------------------===//
7348 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
// rri/rmi forms plus a Pat that commutes a first-operand load into the rmi
// form by rewriting the immediate with commuteXForm.
7349 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7350 ValueType OpVT, X86FoldableSchedWrite sched,
7352 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7353 let isCommutable = 1 in
7354 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7355 (ins RC:$src1, RC:$src2, u8imm:$src3),
7356 !strconcat(OpcodeStr,
7357 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7358 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7359 Sched<[sched]>, VEX_4V;
7360 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7361 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7362 !strconcat(OpcodeStr,
7363 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7365 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7366 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7368 // Pattern to commute if load is in first source.
7369 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7370 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7371 (commuteXForm timm:$src3))>;
7374 let Predicates = [HasAVX2] in {
7375 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7376 SchedWriteBlend.XMM, VR128, i128mem,
7378 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7379 SchedWriteBlend.YMM, VR256, i256mem,
7380 BlendCommuteImm8>, VEX_L;
// 64-bit-element blends are expressed with VPBLENDD by scaling each i64
// mask bit to two i32 mask bits (BlendScaleImm*), with commuted variants
// for a load in the first operand.
7382 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7383 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7384 def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7385 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7386 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7387 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7389 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7390 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7391 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7392 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7393 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7394 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7397 // For insertion into the zero index (low half) of a 256-bit vector, it is
7398 // more efficient to generate a blend with immediate instead of an insert*128.
7399 // NOTE: We're using FP instructions here, but execution domain fixing should
7400 // take care of using integer instructions when profitable.
7401 let Predicates = [HasAVX] in {
// Register case: widen the XMM source via INSERT_SUBREG into an undef YMM,
// then blend — immediate 0xf selects the low four dwords from $src2.
7402 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7403 (VBLENDPSYrri VR256:$src1,
7404 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7405 VR128:$src2, sub_xmm), 0xf)>;
7406 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7407 (VBLENDPSYrri VR256:$src1,
7408 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7409 VR128:$src2, sub_xmm), 0xf)>;
7410 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7411 (VBLENDPSYrri VR256:$src1,
7412 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7413 VR128:$src2, sub_xmm), 0xf)>;
7414 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7415 (VBLENDPSYrri VR256:$src1,
7416 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7417 VR128:$src2, sub_xmm), 0xf)>;
// Memory case: the loaded 256-bit value becomes the memory operand, so the
// immediate flips to 0xf0 (high four dwords come from the load).
7419 def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7420 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7421 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7422 def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7423 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7424 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7425 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7426 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7427 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7428 def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7429 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7430 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7433 //===----------------------------------------------------------------------===//
7434 // VPBROADCAST - Load from memory and broadcast to all elements of the
7435 // destination operand
// avx2_broadcast: rr/rm (XMM) and Yrr/Yrm (YMM) forms, plus a Pat that
// handles a YMM-source broadcast by extracting the low XMM first.
7437 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7438 X86MemOperand x86memop, PatFrag bcast_frag,
7439 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7440 let Predicates = [HasAVX2, prd] in {
7441 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7442 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7444 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7445 Sched<[SchedWriteShuffle.XMM]>, VEX;
7446 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7447 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7449 (OpVT128 (bcast_frag addr:$src)))]>,
7450 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7451 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7452 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7454 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7455 Sched<[WriteShuffle256]>, VEX, VEX_L;
7456 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7457 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7459 (OpVT256 (bcast_frag addr:$src)))]>,
7460 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7462 // Provide aliases for broadcast from the same register class that
7463 // automatically does the extract.
7464 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7465 (!cast<Instruction>(NAME#"Yrr")
7466 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
// Byte/word forms gated on BWI absence; dword/qword on VLX absence.
7470 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7471 v16i8, v32i8, NoVLX_Or_NoBWI>;
7472 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7473 v8i16, v16i16, NoVLX_Or_NoBWI>;
7474 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7475 v4i32, v8i32, NoVLX>;
7476 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7477 v2i64, v4i64, NoVLX>;
7479 let Predicates = [HasAVX2, NoVLX] in {
7480 // Provide fallback in case the load node that is used in the patterns above
7481 // is used by additional users, which prevents the pattern selection.
// Scalar FP register sources: copy FR32/FR64 into the XMM class and use the
// AVX2 register-form broadcasts.
7482 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7483 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7484 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7485 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7486 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7487 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
// GR8/GR16 sources: widen the sub-register to i32 (INSERT_SUBREG into an
// undef i32), move to an XMM via VMOVDI2PDIrr, then broadcast bytes/words.
7490 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7491 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7492 (VPBROADCASTBrr (VMOVDI2PDIrr
7493 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7494 GR8:$src, sub_8bit))))>;
7495 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7496 (VPBROADCASTBYrr (VMOVDI2PDIrr
7497 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7498 GR8:$src, sub_8bit))))>;
7500 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7501 (VPBROADCASTWrr (VMOVDI2PDIrr
7502 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7503 GR16:$src, sub_16bit))))>;
7504 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7505 (VPBROADCASTWYrr (VMOVDI2PDIrr
7506 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7507 GR16:$src, sub_16bit))))>;
// GR32/GR64 sources move straight to an XMM register, then broadcast.
7509 let Predicates = [HasAVX2, NoVLX] in {
7510 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7511 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7512 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7513 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7514 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7515 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7516 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7517 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7520 // AVX1 broadcast patterns
7521 let Predicates = [HasAVX1Only] in {
// Integer broadcast-loads on AVX1-only targets reuse the FP broadcasts.
7522 def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7523 (VBROADCASTSSYrm addr:$src)>;
7524 def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7525 (VBROADCASTSDYrm addr:$src)>;
7526 def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7527 (VBROADCASTSSrm addr:$src)>;
7530 // Provide fallback in case the load node that is used in the patterns above
7531 // is used by additional users, which prevents the pattern selection.
7532 let Predicates = [HasAVX, NoVLX] in {
7533 // 128bit broadcasts:
// v2f64 splat == movddup (duplicate the low double).
7534 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7535 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7536 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7537 (VMOVDDUPrm addr:$src)>;
7539 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7540 (VMOVDDUPrr VR128:$src)>;
// No register-form broadcast on AVX1: splat in-lane with VPERMILPS/VPSHUFD
// (imm 0 = splat element 0; 0x44 = duplicate the low 64-bit pair), then for
// 256-bit results combine two copies with VINSERTF128.
7543 let Predicates = [HasAVX1Only] in {
7544 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7545 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7546 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7547 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7548 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7549 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7550 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7551 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7552 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7553 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7555 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7556 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7557 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7558 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7559 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7560 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7561 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7562 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7563 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7564 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7566 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7567 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7568 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7569 (VMOVDDUPrm addr:$src)>;
7572 //===----------------------------------------------------------------------===//
7573 // VPERM - Permute instructions
7576 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7577 ValueType OpVT, X86FoldableSchedWrite Sched,
7578 X86MemOperand memOp> {
7579 let Predicates = [HasAVX2, NoVLX] in {
7580 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7581 (ins VR256:$src1, VR256:$src2),
7582 !strconcat(OpcodeStr,
7583 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7585 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7586 Sched<[Sched]>, VEX_4V, VEX_L;
7587 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7588 (ins VR256:$src1, memOp:$src2),
7589 !strconcat(OpcodeStr,
7590 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7592 (OpVT (X86VPermv VR256:$src1,
7593 (load addr:$src2))))]>,
7594 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7598 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7599 let ExeDomain = SSEPackedSingle in
7600 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
// avx2_perm_imm - AVX2 immediate-controlled 64-bit-element permute
// (VPERMQ/VPERMPD pattern). The 8-bit immediate selects the qword lane
// order (X86VPermi). Emits register (Yri) and memory-source (Ymi) forms,
// gated on [HasAVX2, NoVLX].
7602 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7603 ValueType OpVT, X86FoldableSchedWrite Sched,
7604 X86MemOperand memOp> {
7605 let Predicates = [HasAVX2, NoVLX] in {
7606 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7607 (ins VR256:$src1, u8imm:$src2),
7608 !strconcat(OpcodeStr,
7609 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7611 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7612 Sched<[Sched]>, VEX, VEX_L;
// Memory form: the permute source itself comes from memory ($src1 is the
// memory operand here, unlike the variable-permute class above).
7613 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7614 (ins memOp:$src1, u8imm:$src2),
7615 !strconcat(OpcodeStr,
7616 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7618 (OpVT (X86VPermi (mem_frag addr:$src1),
7619 (i8 timm:$src2))))]>,
7620 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
// Immediate-permute instantiations; VEX_W selects the 64-bit element forms,
// and VPERMPD is tagged SSEPackedDouble for domain fixing.
7624 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7625 WriteShuffle256, i256mem>, VEX_W;
7626 let ExeDomain = SSEPackedDouble in
7627 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7628 WriteFShuffle256, f256mem>, VEX_W;
7630 //===----------------------------------------------------------------------===//
7631 // VPERM2I128 - Permute Integer vector Values in 128-bit chunks
// VPERM2I128: select/permute 128-bit halves of two 256-bit integer sources
// under control of an 8-bit immediate (X86VPerm2x128).
// Commutable: swapping the sources can be compensated by rewriting the
// immediate (see the Perm2XCommuteImm pattern below).
7633 let isCommutable = 1 in
7634 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7635 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7636 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7637 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7638 (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
7640 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7641 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7642 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7643 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7644 (i8 timm:$src3)))]>,
7645 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// When the LOAD is the first operand, commute the sources (so the load can
// be folded as $src2) and adjust the immediate with Perm2XCommuteImm.
7647 let Predicates = [HasAVX2] in
7648 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7649 VR256:$src1, (i8 timm:$imm))),
7650 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7653 //===----------------------------------------------------------------------===//
7654 // VINSERTI128 - Insert packed integer values
// VINSERTI128: insert a 128-bit value into the half of a 256-bit register
// selected by the immediate. Defined pattern-less (hasSideEffects = 0);
// actual selection comes from the vinsert_lowering pattern multiclass below.
7656 let hasSideEffects = 0 in {
7657 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7658 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7659 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7660 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7662 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7663 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7664 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7665 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// Subvector-insert lowering patterns for every 128->256 integer element
// type; NoVLX keeps these away from AVX512VL's EVEX equivalents.
7668 let Predicates = [HasAVX2, NoVLX] in {
7669 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
7670 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
7671 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
7672 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
7675 //===----------------------------------------------------------------------===//
7676 // VEXTRACTI128 - Extract packed integer values
// VEXTRACTI128: extract the 128-bit half selected by the immediate into an
// XMM register (rr) or directly to memory (mr). Pattern-less; selection is
// driven by the vextract_lowering patterns below.
7678 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7679 (ins VR256:$src1, u8imm:$src2),
7680 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7681 Sched<[WriteShuffle256]>, VEX, VEX_L;
// Store form only writes memory; modeled as mayStore with no other effects.
7682 let hasSideEffects = 0, mayStore = 1 in
7683 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7684 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7685 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7686 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
// Subvector-extract lowering patterns for each 256->128 integer type.
7688 let Predicates = [HasAVX2, NoVLX] in {
7689 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
7690 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
7691 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7692 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
7695 //===----------------------------------------------------------------------===//
7696 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
// avx2_pmovmask - VPMASKMOV conditional integer loads/stores. Each element
// is loaded/stored only where the corresponding mask element's sign bit is
// set. Emits 128-bit (rm/mr) and 256-bit (Yrm/Ymr) load and store forms,
// selected via the target intrinsics passed in by the caller.
7698 multiclass avx2_pmovmask<string OpcodeStr,
7699 Intrinsic IntLd128, Intrinsic IntLd256,
7700 Intrinsic IntSt128, Intrinsic IntSt256,
7701 X86SchedWriteMaskMove schedX,
7702 X86SchedWriteMaskMove schedY> {
// Masked loads: $src1 is the mask register, $src2 the memory address.
7703 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7704 (ins VR128:$src1, i128mem:$src2),
7705 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7706 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7707 VEX_4V, Sched<[schedX.RM]>;
7708 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7709 (ins VR256:$src1, i256mem:$src2),
7710 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7711 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7712 VEX_4V, VEX_L, Sched<[schedY.RM]>;
// Masked stores: $src1 is the mask, $src2 the data to store.
7713 def mr : AVX28I<0x8e, MRMDestMem, (outs),
7714 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7715 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7716 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7717 VEX_4V, Sched<[schedX.MR]>;
7718 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7719 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7720 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7721 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7722 VEX_4V, VEX_L, Sched<[schedY.MR]>;
// Dword (VPMASKMOVD) and qword (VPMASKMOVQ, VEX_W) instantiations of the
// conditional move multiclass, wired to the corresponding AVX2 intrinsics.
7725 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7726 int_x86_avx2_maskload_d,
7727 int_x86_avx2_maskload_d_256,
7728 int_x86_avx2_maskstore_d,
7729 int_x86_avx2_maskstore_d_256,
7730 WriteVecMaskMove32, WriteVecMaskMove32Y>;
7731 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7732 int_x86_avx2_maskload_q,
7733 int_x86_avx2_maskload_q_256,
7734 int_x86_avx2_maskstore_q,
7735 int_x86_avx2_maskstore_q_256,
7736 WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
// maskmov_lowering - map generic masked_store / masked_load DAG nodes onto
// a given VMASKMOV/VPMASKMOV instruction pair (InstrStr # "mr"/"rm").
// masked_load is matched both with an undef passthru and with an all-zeros
// passthru; the instruction zeroes inactive lanes, so both are legal.
7738 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7741 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7742 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7744 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7745 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7746 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7747 (VT immAllZerosV))),
7748 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
// FP masked load/store lowerings, available on any AVX target.
7750 let Predicates = [HasAVX] in {
7751 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7752 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7753 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7754 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7756 let Predicates = [HasAVX1Only] in {
7757 // AVX1 has no integer masked load/store; fall back to the ps/pd forms
7758 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7759 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7760 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7761 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
// On AVX2, integer masked ops use the native VPMASKMOV instructions.
7763 let Predicates = [HasAVX2] in {
7764 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7765 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7766 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7767 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7770 //===----------------------------------------------------------------------===//
7771 // SubVector Broadcasts
7772 // Provide fallback in case the load node that is used in the patterns above
7773 // is used by additional users, which prevents the pattern selection.
// X86SubVBroadcast fallbacks: broadcast a 128-bit register into both
// halves of a 256-bit register by inserting it at sub_xmm of an
// IMPLICIT_DEF and then VINSERTF128'ing it into the upper half.
7775 let Predicates = [HasAVX, NoVLX] in {
7776 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
7777 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7778 (v2f64 VR128:$src), 1)>;
7779 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
7780 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7781 (v4f32 VR128:$src), 1)>;
7784 // NOTE: We're using FP instructions here, but execution domain fixing can
7785 // convert to integer when profitable.
7786 let Predicates = [HasAVX, NoVLX] in {
7787 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
7788 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7789 (v2i64 VR128:$src), 1)>;
7790 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
7791 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7792 (v4i32 VR128:$src), 1)>;
7793 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
7794 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7795 (v8i16 VR128:$src), 1)>;
7796 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
7797 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7798 (v16i8 VR128:$src), 1)>;
7801 //===----------------------------------------------------------------------===//
7802 // Variable Bit Shifts
// avx2_var_shift - AVX2 per-element variable shifts (VPSLLV/VPSRLV/VPSRAV
// family). $src2 holds one shift count per element. Emits 128-bit (rr/rm)
// and 256-bit (Yrr/Yrm) register and folded-load forms.
7804 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7805 ValueType vt128, ValueType vt256> {
7806 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7807 (ins VR128:$src1, VR128:$src2),
7808 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7810 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7811 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
// Memory form folds the shift-count vector load.
7812 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7813 (ins VR128:$src1, i128mem:$src2),
7814 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7816 (vt128 (OpNode VR128:$src1,
7817 (vt128 (load addr:$src2)))))]>,
7818 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7819 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
7820 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7821 (ins VR256:$src1, VR256:$src2),
7822 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7824 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7825 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
7826 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7827 (ins VR256:$src1, i256mem:$src2),
7828 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7830 (vt256 (OpNode VR256:$src1,
7831 (vt256 (load addr:$src2)))))]>,
7832 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7833 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
// Variable-shift instantiations. VEX_W marks the 64-bit element forms.
// Note there is no VPSRAVQ: AVX2 lacks a 64-bit arithmetic right shift.
7836 let Predicates = [HasAVX2, NoVLX] in {
7837 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
7838 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
7839 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
7840 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
7841 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
7844 //===----------------------------------------------------------------------===//
7845 // VGATHER - GATHER Operations
7847 // FIXME: Improve scheduling of gather instructions.
// avx2_gather - VGATHER/VPGATHER. Loads elements from memory at per-element
// indices; $mask gates each element and is written back ($mask_wb) as
// elements complete. Pattern-less (mayLoad, hasSideEffects = 0); selection
// is done elsewhere. MTx/MTy default to the data types when the mask type
// matches (integer gathers).
7848 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
7849 ValueType VTy, RegisterClass RC256,
7850 X86MemOperand memop128, X86MemOperand memop256,
7851 ValueType MTx = VTx, ValueType MTy = VTy> {
7852 let mayLoad = 1, hasSideEffects = 0 in {
7853 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
7854 (ins VR128:$src1, memop128:$src2, VR128:$mask),
7855 !strconcat(OpcodeStr,
7856 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7857 []>, VEX, Sched<[WriteLoad]>;
// 256-bit form; RC256 lets QD/QPS variants narrow the destination to VR128.
7858 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
7859 (ins RC256:$src1, memop256:$src2, RC256:$mask),
7860 !strconcat(OpcodeStr,
7861 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7862 []>, VEX, VEX_L, Sched<[WriteLoad]>;
// Gather instantiations. $dst and $mask_wb are early-clobber and tied to
// $src1/$mask because the hardware reads and updates both across the
// (potentially faulting, restartable) gather.
7866 let Predicates = [HasAVX2] in {
7867 let mayLoad = 1, hasSideEffects = 0, Constraints
7868 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
7870 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
7871 VR256, vx128mem, vx256mem>, VEX_W;
7872 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
7873 VR256, vx128mem, vy256mem>, VEX_W;
7874 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
7875 VR256, vx128mem, vy256mem>;
7876 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
7877 VR128, vx64mem, vy128mem>;
// FP gathers use integer mask types (MTx/MTy) distinct from the data types.
7879 let ExeDomain = SSEPackedDouble in {
7880 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
7881 VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
7882 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
7883 VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
7886 let ExeDomain = SSEPackedSingle in {
7887 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
7888 VR256, vx128mem, vy256mem, v4i32, v8i32>;
7889 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
7890 VR128, vx64mem, vy128mem, v4i32, v4i32>;
7895 //===----------------------------------------------------------------------===//
7896 // GFNI instructions
7897 //===----------------------------------------------------------------------===//
// GF2P8MULB_rm - GFNI byte multiply in GF(2^8) (X86GF2P8mulb). Is2Addr
// selects the legacy two-operand SSE asm string vs. the three-operand VEX
// form. Per-instruction AsmStrings are empty; the asm comes from the `let`.
7899 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
7900 RegisterClass RC, PatFrag MemOpFrag,
7901 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7902 let ExeDomain = SSEPackedInt,
7903 AsmString = !if(Is2Addr,
7904 OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
7905 OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
// GF(2^8) multiplication is commutative, enabling operand swap for folding.
7906 let isCommutable = 1 in
7907 def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
7908 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
7909 Sched<[SchedWriteVecALU.XMM]>, T8PD;
7911 def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
7912 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
7913 (MemOpFrag addr:$src2))))]>,
7914 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
// GF2P8AFFINE_rmi - GFNI affine transform with an 8-bit immediate
// (GF2P8AFFINEQB / GF2P8AFFINEINVQB via OpNode). Is2Addr toggles the SSE
// two-operand vs. VEX three-operand asm string, as in GF2P8MULB_rm above.
7918 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
7919 SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
7920 X86MemOperand X86MemOp, bit Is2Addr = 0> {
7921 let AsmString = !if(Is2Addr,
7922 OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7923 OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
7924 def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
7925 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
7926 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
7927 SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
// Memory form folds the second (matrix) operand load.
7928 def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
7929 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
7930 [(set RC:$dst, (OpVT (OpNode RC:$src1,
7931 (MemOpFrag addr:$src2),
7932 timm:$src3)))], SSEPackedInt>,
7933 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
// GF2P8AFFINE_common - instantiate one affine opcode in all three encodings:
// legacy SSE (two-operand, tied $src1 = $dst), VEX 128-bit, and VEX 256-bit.
// All affine forms carry VEX_W per the GFNI encoding.
7937 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
7938 let Constraints = "$src1 = $dst",
7939 Predicates = [HasGFNI, UseSSE2] in
7940 defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
7941 VR128, load, i128mem, 1>;
7942 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7943 defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
7944 load, i128mem>, VEX_4V, VEX_W;
7945 defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
7946 load, i256mem>, VEX_4V, VEX_L, VEX_W;
// GFNI instruction instantiations: SSE and VEX forms of GF2P8MULB, then the
// two affine transforms (inverse and plain) via GF2P8AFFINE_common.
7951 let Constraints = "$src1 = $dst",
7952 Predicates = [HasGFNI, UseSSE2] in
7953 defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
7955 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7956 defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
7958 defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
7959 i256mem>, VEX_4V, VEX_L;
7961 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
// isCommutable = 0: the affine transform's operands are not interchangeable
// (vector vs. transform matrix).
7962 let isCommutable = 0 in {
7963 defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
7964 X86GF2P8affineinvqb>, TAPD;
7965 defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
7966 X86GF2P8affineqb>, TAPD;