1 //===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file describes the X86 SSE instruction set, defining the instructions,
11 // and properties of the instructions which are needed for code generation,
12 // machine code emission, and analysis.
14 //===----------------------------------------------------------------------===//
16 //===----------------------------------------------------------------------===//
17 // SSE 1 & 2 Instructions Classes
18 //===----------------------------------------------------------------------===//
20 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
// Instantiates the reg/reg ("rr") and reg/mem ("rm") forms of a scalar
// SSE1/SSE2 FP arithmetic instruction (add/sub/mul/div style).
// NOTE(review): the embedded original line numbers jump (23->25, 30->33),
// so this excerpt is missing lines — likely the trailing multiclass
// parameters (e.g. `bit Is2Addr`) and the `!if(Is2Addr, ...)` asm selector
// plus the rr Sched<> attachment. Verify against upstream before editing.
21 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
22 RegisterClass RC, X86MemOperand x86memop,
23 Domain d, X86FoldableSchedWrite sched,
// rr form is commutable so the two-address pass may swap operands.
25 let isCommutable = 1 in {
26 def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
// Two asm templates: SSE 2-operand form vs. AVX 3-operand form.
28 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30 [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
// rm form: second source is a folded load from memory.
33 def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
35 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37 [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38 Sched<[sched.Folded, sched.ReadAfterFold]>;
41 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
// Intrinsic (whole-XMM) variants of the scalar FP ops. Marked
// isCodeGenOnly + hasSideEffects = 0: these are selection-only twins of the
// user-visible instructions above.
// NOTE(review): numbering jumps (48->50, 52->55) — the `!if(Is2Addr, ...)`
// wrapper and the rr_Int Sched<> line appear to be elided in this excerpt.
42 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
43 SDPatternOperator OpNode, RegisterClass RC,
44 ValueType VT, string asm, Operand memopr,
45 ComplexPattern mem_cpat, Domain d,
46 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
47 let isCodeGenOnly = 1, hasSideEffects = 0 in {
48 def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
50 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
51 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
52 [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
// rm_Int: memory operand matched through the mem_cpat complex pattern.
55 def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
57 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
58 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
59 [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
60 Sched<[sched.Folded, sched.ReadAfterFold]>;
64 /// sse12_fp_packed - SSE 1 & 2 packed instructions class
// Packed (vector) counterpart of sse12_fp_scalar: rr and folded-load rm
// forms over a full register class / memory operand, loading through
// mem_frag (aligned vs. unaligned chosen by the instantiation).
// NOTE(review): numbering jumps (68->70, 75->78, 82->84) indicate elided
// lines (Is2Addr parameter, !if() asm selector, rr Sched<>) in this excerpt.
65 multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
66 RegisterClass RC, ValueType vt,
67 X86MemOperand x86memop, PatFrag mem_frag,
68 Domain d, X86FoldableSchedWrite sched,
70 let isCommutable = 1 in
71 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
73 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
74 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
75 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
// rm form: right-hand operand folded from memory via mem_frag.
78 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
80 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
81 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
82 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
84 Sched<[sched.Folded, sched.ReadAfterFold]>;
87 /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
// Packed logical ops (and/or/xor/andn). Patterns are passed in by the
// caller (pat_rr / pat_rm) rather than built here, because the logical
// ops need different patterns per domain/width.
// NOTE(review): numbering jumps (91->93, 97->100, 104->106) — the trailing
// multiclass parameter list, the !if() asm selectors, and the rr pattern/
// Sched<> lines appear elided in this excerpt.
88 multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
89 string OpcodeStr, X86MemOperand x86memop,
90 X86FoldableSchedWrite sched,
91 list<dag> pat_rr, list<dag> pat_rm,
93 let isCommutable = 1, hasSideEffects = 0 in
94 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
96 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
97 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
// rm form: mayLoad because the load is implicit in the (empty-here) pattern.
100 let hasSideEffects = 0, mayLoad = 1 in
101 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
103 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
104 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
106 Sched<[sched.Folded, sched.ReadAfterFold]>;
110 // Alias instructions that map fld0 to xorps for sse or vxorps for avx.
111 // This is expanded by ExpandPostRAPseudos.
// Pseudo-instructions producing FP/vector zero and all-ones constants.
// All are rematerializable and as cheap as a move, so the register
// allocator can re-create them instead of spilling.
// NOTE(review): numbering jumps (117->120, 132->135, 146->149, 157->159)
// show that the closing `}` of several `let` blocks was elided from this
// excerpt; the code below is otherwise reproduced verbatim.
112 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
113 isPseudo = 1, SchedRW = [WriteZero] in {
// Scalar +0.0 in FR32/FR64, expanded post-RA to (v)xorps/(v)xorpd.
114 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
115 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
116 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
117 [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
120 //===----------------------------------------------------------------------===//
121 // AVX & SSE - Zero/One Vectors
122 //===----------------------------------------------------------------------===//
124 // Alias instruction that maps zero vector to pxor / xorp* for sse.
125 // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
126 // swizzled by ExecutionDomainFix to pxor.
127 // We set canFoldAsLoad because this can be converted to a constant-pool
128 // load of an all-zeros value if folding it would be beneficial.
129 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
130 isPseudo = 1, SchedRW = [WriteZero] in {
131 def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
132 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
// Reuse V_SET0 for the integer all-zeros vector when AVX-512 is absent.
135 let Predicates = [NoAVX512] in
136 def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
139 // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
140 // and doesn't need it because on sandy bridge the register is set to zero
141 // at the rename stage without using any execution unit, so SET0PSY
142 // and SET0PDY can be used for vector int instructions without penalty
143 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
144 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
145 def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
146 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
149 // We set canFoldAsLoad because this can be converted to a constant-pool
150 // load of an all-ones value if folding it would be beneficial.
151 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
152 isPseudo = 1, SchedRW = [WriteZero] in {
153 def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
154 [(set VR128:$dst, (v4i32 immAllOnesV))]>;
// AVX1-only 256-bit all-ones: only when optimizing for size (no pcmpeq ymm).
155 let Predicates = [HasAVX1Only, OptForMinSize] in {
156 def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
157 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
159 let Predicates = [HasAVX2] in
160 def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
161 [(set VR256:$dst, (v8i32 immAllOnesV))]>;
164 //===----------------------------------------------------------------------===//
165 // SSE 1 & 2 - Move FP Scalar Instructions
167 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
168 // register copies because it's a partial register update; Register-to-register
169 // movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
170 // that the insert be implementable in terms of a copy, and just mentioned, we
171 // don't use movss/movsd for copies.
172 //===----------------------------------------------------------------------===//
// Builds the rr form of movss/movsd plus an rr_REV twin (opcode 0x11,
// MRMDestReg) that exists only for the disassembler; FoldGenData links the
// REV form back to the canonical encoding for MC analysis.
174 multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
175 X86MemOperand x86memop, string base_opc,
176 string asm_opr, Domain d, string Name> {
177 let isCommutable = 1 in
178 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
179 (ins VR128:$src1, VR128:$src2),
180 !strconcat(base_opc, asm_opr),
181 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
182 Sched<[SchedWriteFShuffle.XMM]>;
184 // For the disassembler
185 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
186 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
187 (ins VR128:$src1, VR128:$src2),
188 !strconcat(base_opc, asm_opr), []>,
189 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
// Top-level driver for MOVSS/MOVSD: instantiates the VEX ("V"#NAME) and
// legacy SSE (NAME) register-move and store forms, plus ".s" asm aliases
// for the reversed encodings.
// NOTE(review): numbering jumps (194->196, 198->200, 210->213, 216->218)
// indicate elided lines (comments/Name argument/closing braces) here.
192 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
193 X86MemOperand x86memop, string OpcodeStr,
194 Domain d, string Name, Predicate pred> {
// AVX variant: 3-operand asm, no tied constraint.
196 let Predicates = [UseAVX, OptForSize] in
197 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
198 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
200 VEX_4V, VEX_LIG, VEX_WIG;
// VEX-encoded scalar store (v)movss/(v)movsd mr form.
202 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
203 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
204 [(store RC:$src, addr:$dst)], d>,
205 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
// Legacy SSE variant: 2-operand, destination tied to $src1.
207 let Constraints = "$src1 = $dst" in {
208 let Predicates = [pred, NoSSE41_Or_OptForSize] in
209 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
210 "\t{$src2, $dst|$dst, $src2}", d, Name>;
213 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
214 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
215 [(store RC:$src, addr:$dst)], d>,
216 Sched<[WriteFStore]>;
// GAS-compatible ".s" mnemonics select the reversed (MRMDestReg) encodings.
218 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
219 (!cast<Instruction>("V"#NAME#"rr_REV")
220 VR128:$dst, VR128:$src1, VR128:$src2), 0>;
221 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
222 (!cast<Instruction>(NAME#"rr_REV")
223 VR128:$dst, VR128:$src2), 0>;
226 // Loading from memory automatically zeroing upper bits.
// rm (load) forms of MOVSS/MOVSD in VEX and legacy encodings, followed by
// the concrete MOVSS/MOVSD instantiations of both multiclasses.
// NOTE(review): numbering jumps (235->239) — the legacy rm Sched<> line and
// the multiclass's closing brace appear elided in this excerpt.
227 multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
228 PatFrag mem_pat, string OpcodeStr, Domain d> {
229 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
231 [(set RC:$dst, (mem_pat addr:$src))], d>,
232 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
233 def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
234 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
235 [(set RC:$dst, (mem_pat addr:$src))], d>,
// Concrete register-move/store instantiations (XS = F3 prefix, XD = F2).
239 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
240 SSEPackedSingle, "MOVSS", UseSSE1>, XS;
241 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
242 SSEPackedDouble, "MOVSD", UseSSE2>, XD;
// Load forms may be folded or rematerialized freely.
244 let canFoldAsLoad = 1, isReMaterializable = 1 in {
245 defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
246 SSEPackedSingle>, XS;
247 defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
248 SSEPackedDouble>, XD;
// Selection patterns mapping zero-extending scalar-move DAG nodes
// (X86vzmovl / X86vzload) onto (V)MOVSS/(V)MOVSD, for AVX and legacy SSE.
// NOTE(review): several original lines are elided in this excerpt (e.g.
// 260->262, 269->271, 311->313) — closing braces and sub_xmm operands of
// some patterns are missing; code below is otherwise verbatim.
252 let Predicates = [UseAVX] in {
253 // MOVSSrm zeros the high parts of the register; represent this
254 // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
255 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
256 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
257 def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
258 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
259 def : Pat<(v4f32 (X86vzload addr:$src)),
260 (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
262 // MOVSDrm zeros the high parts of the register; represent this
263 // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
264 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
265 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
266 def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
267 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
268 def : Pat<(v2f64 (X86vzload addr:$src)),
269 (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
271 // Represent the same patterns above but in the form they appear for
// 256-bit types: a 128-bit scalar load zero-extended into a YMM register.
273 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
274 (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
275 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
276 def : Pat<(v8f32 (X86vzload addr:$src)),
277 (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
278 def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
279 (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
280 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
281 def : Pat<(v4f64 (X86vzload addr:$src)),
282 (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
284 // Extract and store.
285 def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
287 (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
290 let Predicates = [UseAVX, OptForSize] in {
291 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
292 // MOVSS to the lower bits.
293 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
294 (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
295 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
296 (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
298 // Move low f32 and clear high bits.
299 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
300 (SUBREG_TO_REG (i32 0),
301 (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
302 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
303 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
304 (SUBREG_TO_REG (i32 0),
305 (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
306 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
308 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
309 (SUBREG_TO_REG (i32 0),
310 (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
311 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
313 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
314 (SUBREG_TO_REG (i32 0),
315 (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
316 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
// Legacy-SSE equivalents of the patterns above.
320 let Predicates = [UseSSE1] in {
321 let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
322 // Move scalar to XMM zero-extended, zeroing a VR128 then do a
323 // MOVSS to the lower bits.
324 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
325 (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
326 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
327 (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
330 // MOVSSrm already zeros the high parts of the register.
331 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
332 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
333 def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
334 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
335 def : Pat<(v4f32 (X86vzload addr:$src)),
336 (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
338 // Extract and store.
339 def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
341 (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
344 let Predicates = [UseSSE2] in {
345 // MOVSDrm already zeros the high parts of the register.
346 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
347 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
348 def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
349 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
350 def : Pat<(v2f64 (X86vzload addr:$src)),
351 (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
354 // Aliases to help the assembler pick two byte VEX encodings by swapping the
355 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
356 def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
357 (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
358 def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
359 (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
361 //===----------------------------------------------------------------------===//
362 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
363 //===----------------------------------------------------------------------===//
// Packed aligned/unaligned FP moves: rr form has no pattern (isMoveReg),
// rm form performs the load through ld_frag. Instantiated below for
// MOVAPS/MOVAPD/MOVUPS/MOVUPD in XMM (VEX + legacy) and YMM (VEX) widths.
// NOTE(review): numbering jumps (371->373, 376->380, 382->384, ...) show
// elided Sched<>/encoding/closing-brace lines throughout this excerpt.
365 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
366 X86MemOperand x86memop, PatFrag ld_frag,
367 string asm, Domain d,
368 X86SchedWriteMoveLS sched> {
369 let hasSideEffects = 0, isMoveReg = 1 in
370 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
371 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
373 let canFoldAsLoad = 1, isReMaterializable = 1 in
374 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
375 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
376 [(set RC:$dst, (ld_frag addr:$src))], d>,
// VEX-encoded 128-bit variants (disabled under AVX-512/VLX).
380 let Predicates = [HasAVX, NoVLX] in {
381 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
382 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
384 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
385 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
387 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
388 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
390 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
391 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// VEX-encoded 256-bit (YMM) variants.
394 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
395 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
396 PS, VEX, VEX_L, VEX_WIG;
397 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
398 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
399 PD, VEX, VEX_L, VEX_WIG;
400 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
401 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
402 PS, VEX, VEX_L, VEX_WIG;
403 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
404 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
405 PD, VEX, VEX_L, VEX_WIG;
// Legacy SSE variants (used when the VEX forms are unavailable/unwanted).
408 let Predicates = [UseSSE1] in {
409 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
410 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
412 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
413 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
416 let Predicates = [UseSSE2] in {
417 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
418 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
420 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
421 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
// VEX-encoded packed-FP store (mr) instructions for XMM and YMM, followed
// by the disassembler-only reversed (MRMDestReg) register-move encodings.
// NOTE(review): numbering jumps (429->431, 466->468, 469->471, ...) show
// elided encoding-suffix lines and the (ins ...) operand lines of the REV
// defs in this excerpt.
425 let Predicates = [HasAVX, NoVLX] in {
426 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
427 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
428 "movaps\t{$src, $dst|$dst, $src}",
429 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
431 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
432 "movapd\t{$src, $dst|$dst, $src}",
433 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
435 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
436 "movups\t{$src, $dst|$dst, $src}",
437 [(store (v4f32 VR128:$src), addr:$dst)]>,
439 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
440 "movupd\t{$src, $dst|$dst, $src}",
441 [(store (v2f64 VR128:$src), addr:$dst)]>,
// 256-bit (YMM) store forms.
445 let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
446 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
447 "movaps\t{$src, $dst|$dst, $src}",
448 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
450 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
451 "movapd\t{$src, $dst|$dst, $src}",
452 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
454 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
455 "movups\t{$src, $dst|$dst, $src}",
456 [(store (v8f32 VR256:$src), addr:$dst)]>,
458 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
459 "movupd\t{$src, $dst|$dst, $src}",
460 [(store (v4f64 VR256:$src), addr:$dst)]>,
// Disassembler-only reversed encodings; FoldGenData ties each back to its
// canonical instruction for the memory-fold tables.
466 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
468 let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
469 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
471 "movaps\t{$src, $dst|$dst, $src}", []>,
472 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
473 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
475 "movapd\t{$src, $dst|$dst, $src}", []>,
476 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
477 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
479 "movups\t{$src, $dst|$dst, $src}", []>,
480 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
481 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
483 "movupd\t{$src, $dst|$dst, $src}", []>,
484 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
487 let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
488 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
490 "movaps\t{$src, $dst|$dst, $src}", []>,
491 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
492 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
494 "movapd\t{$src, $dst|$dst, $src}", []>,
495 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
496 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
498 "movups\t{$src, $dst|$dst, $src}", []>,
499 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
500 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
502 "movupd\t{$src, $dst|$dst, $src}", []>,
503 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
507 // Aliases to help the assembler pick two byte VEX encodings by swapping the
508 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
// The VR128L/VR128H (and VR256L/VR256H) register classes split the register
// file so the swapped encoding is only preferred when it actually saves a
// byte; priority 0 keeps these as assembler-only alternatives.
509 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
510 (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
511 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
512 (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
513 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
514 (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
515 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
516 (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
517 def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
518 (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
519 def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
520 (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
521 def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
522 (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
523 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
524 (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
526 // Reversed version with ".s" suffix for GAS compatibility.
527 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
528 (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
529 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
530 (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
531 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
532 (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
533 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
534 (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
535 def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
536 (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
537 def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
538 (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
539 def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
540 (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
541 def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
542 (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
// Legacy SSE (non-VEX) packed store instructions.
544 let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
545 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
546 "movaps\t{$src, $dst|$dst, $src}",
547 [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
548 def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
549 "movapd\t{$src, $dst|$dst, $src}",
550 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
551 def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
552 "movups\t{$src, $dst|$dst, $src}",
553 [(store (v4f32 VR128:$src), addr:$dst)]>;
554 def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
555 "movupd\t{$src, $dst|$dst, $src}",
556 [(store (v2f64 VR128:$src), addr:$dst)]>;
// Legacy SSE reversed register-register encodings (disassembler only).
560 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
561 isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
562 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
563 "movaps\t{$src, $dst|$dst, $src}", []>,
564 FoldGenData<"MOVAPSrr">;
565 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
566 "movapd\t{$src, $dst|$dst, $src}", []>,
567 FoldGenData<"MOVAPDrr">;
568 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
569 "movups\t{$src, $dst|$dst, $src}", []>,
570 FoldGenData<"MOVUPSrr">;
571 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
572 "movupd\t{$src, $dst|$dst, $src}", []>,
573 FoldGenData<"MOVUPDrr">;
576 // Reversed version with ".s" suffix for GAS compatibility.
577 def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
578 (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
579 def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
580 (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
581 def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
582 (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
583 def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
584 (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
// Map integer-vector loads/stores onto the FP move instructions: the FP
// encodings are one byte shorter (SSE) or required when AVX2 is absent
// (256-bit); ExecutionDomainFix converts to MOVDQA/MOVDQU when profitable.
586 let Predicates = [HasAVX, NoVLX] in {
587 // 256-bit load/store need to use floating point load/store in case we don't
588 // have AVX2. Execution domain fixing will convert to integer if AVX2 is
589 // available and changing the domain is beneficial.
590 def : Pat<(alignedloadv4i64 addr:$src),
591 (VMOVAPSYrm addr:$src)>;
592 def : Pat<(alignedloadv8i32 addr:$src),
593 (VMOVAPSYrm addr:$src)>;
594 def : Pat<(alignedloadv16i16 addr:$src),
595 (VMOVAPSYrm addr:$src)>;
596 def : Pat<(alignedloadv32i8 addr:$src),
597 (VMOVAPSYrm addr:$src)>;
598 def : Pat<(loadv4i64 addr:$src),
599 (VMOVUPSYrm addr:$src)>;
600 def : Pat<(loadv8i32 addr:$src),
601 (VMOVUPSYrm addr:$src)>;
602 def : Pat<(loadv16i16 addr:$src),
603 (VMOVUPSYrm addr:$src)>;
604 def : Pat<(loadv32i8 addr:$src),
605 (VMOVUPSYrm addr:$src)>;
607 def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
608 (VMOVAPSYmr addr:$dst, VR256:$src)>;
609 def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
610 (VMOVAPSYmr addr:$dst, VR256:$src)>;
611 def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
612 (VMOVAPSYmr addr:$dst, VR256:$src)>;
613 def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
614 (VMOVAPSYmr addr:$dst, VR256:$src)>;
615 def : Pat<(store (v4i64 VR256:$src), addr:$dst),
616 (VMOVUPSYmr addr:$dst, VR256:$src)>;
617 def : Pat<(store (v8i32 VR256:$src), addr:$dst),
618 (VMOVUPSYmr addr:$dst, VR256:$src)>;
619 def : Pat<(store (v16i16 VR256:$src), addr:$dst),
620 (VMOVUPSYmr addr:$dst, VR256:$src)>;
621 def : Pat<(store (v32i8 VR256:$src), addr:$dst),
622 (VMOVUPSYmr addr:$dst, VR256:$src)>;
625 // Use movaps / movups for SSE integer load / store (one byte shorter).
626 // The instructions selected below are then converted to MOVDQA/MOVDQU
627 // during the SSE domain pass.
628 let Predicates = [UseSSE1] in {
629 def : Pat<(alignedloadv2i64 addr:$src),
630 (MOVAPSrm addr:$src)>;
631 def : Pat<(alignedloadv4i32 addr:$src),
632 (MOVAPSrm addr:$src)>;
633 def : Pat<(alignedloadv8i16 addr:$src),
634 (MOVAPSrm addr:$src)>;
635 def : Pat<(alignedloadv16i8 addr:$src),
636 (MOVAPSrm addr:$src)>;
637 def : Pat<(loadv2i64 addr:$src),
638 (MOVUPSrm addr:$src)>;
639 def : Pat<(loadv4i32 addr:$src),
640 (MOVUPSrm addr:$src)>;
641 def : Pat<(loadv8i16 addr:$src),
642 (MOVUPSrm addr:$src)>;
643 def : Pat<(loadv16i8 addr:$src),
644 (MOVUPSrm addr:$src)>;
646 def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
647 (MOVAPSmr addr:$dst, VR128:$src)>;
648 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
649 (MOVAPSmr addr:$dst, VR128:$src)>;
650 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
651 (MOVAPSmr addr:$dst, VR128:$src)>;
652 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
653 (MOVAPSmr addr:$dst, VR128:$src)>;
654 def : Pat<(store (v2i64 VR128:$src), addr:$dst),
655 (MOVUPSmr addr:$dst, VR128:$src)>;
656 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
657 (MOVUPSmr addr:$dst, VR128:$src)>;
658 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
659 (MOVUPSmr addr:$dst, VR128:$src)>;
660 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
661 (MOVUPSmr addr:$dst, VR128:$src)>;
664 //===----------------------------------------------------------------------===//
665 // SSE 1 & 2 - Move Low packed FP Instructions
666 //===----------------------------------------------------------------------===//
// Shared machinery for MOVLPS/MOVLPD and MOVHPS/MOVHPD load forms:
// the "s"/"d" suffix is appended to base_opc, and only the PD form carries
// a pattern (the PS form is special-cased in C++ / by separate patterns).
668 multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
669 string base_opc, string asm_opr> {
670 // No pattern as they need be special cased between high and low.
671 let hasSideEffects = 0, mayLoad = 1 in
672 def PSrm : PI<opc, MRMSrcMem,
673 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
674 !strconcat(base_opc, "s", asm_opr),
675 [], SSEPackedSingle>, PS,
676 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
678 def PDrm : PI<opc, MRMSrcMem,
679 (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
680 !strconcat(base_opc, "d", asm_opr),
681 [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
682 (scalar_to_vector (loadf64 addr:$src2)))))],
683 SSEPackedDouble>, PD,
684 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
// Wrapper instantiating VEX (3-operand) and legacy (tied 2-operand) forms.
// NOTE(review): numbering jumps (687->689, 691->694) — the parameter list
// tail and VEX encoding suffixes appear elided in this excerpt.
687 multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
689 let Predicates = [UseAVX] in
690 defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
691 "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
694 let Constraints = "$src1 = $dst" in
695 defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
696 "\t{$src2, $dst|$dst, $src2}">;
// MOVLPS/MOVLPD: load/store the low 64 bits of an XMM register.
699 defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
// Store forms: write the low f64 element to memory.
701 let SchedRW = [WriteFStore] in {
702 let Predicates = [UseAVX] in {
703 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
704 "movlps\t{$src, $dst|$dst, $src}",
705 [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
706 (iPTR 0))), addr:$dst)]>,
708 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
709 "movlpd\t{$src, $dst|$dst, $src}",
710 [(store (f64 (extractelt (v2f64 VR128:$src),
711 (iPTR 0))), addr:$dst)]>,
714 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
715 "movlps\t{$src, $dst|$dst, $src}",
716 [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
717 (iPTR 0))), addr:$dst)]>;
718 def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
719 "movlpd\t{$src, $dst|$dst, $src}",
720 [(store (f64 (extractelt (v2f64 VR128:$src),
721 (iPTR 0))), addr:$dst)]>;
724 let Predicates = [UseSSE1] in {
725 // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
726 def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
727 (iPTR 0))), addr:$src1),
728 (MOVLPSmr addr:$src1, VR128:$src2)>;
730 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
731 // end up with a movsd or blend instead of shufp.
732 // No need for aligned load, we're only loading 64-bits.
733 def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)),
734 (MOVLPSrm VR128:$src1, addr:$src2)>;
737 //===----------------------------------------------------------------------===//
738 // SSE 1 & 2 - Move Hi packed FP Instructions
739 //===----------------------------------------------------------------------===//
// MOVH load/reg forms come from the shared hilo multiclass; opcode 0x16,
// lowered through the X86Unpckl node.
741 defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
743 let SchedRW = [WriteFStore] in {
744 // v2f64 extract element 1 is always custom lowered to unpack high to low
745 // and extract element 0 so the non-store version isn't too horrible.
746 let Predicates = [UseAVX] in {
// VEX-encoded MOVHPS store: writes the high 64 bits of $src (obtained via
// unpckh-with-self, then extract element 0) to memory.
747 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
748 "movhps\t{$src, $dst|$dst, $src}",
749 [(store (f64 (extractelt
750 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
751 (bc_v2f64 (v4f32 VR128:$src))),
752 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
// VEX-encoded MOVHPD store: same high-64-bit store, double-precision domain.
753 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
754 "movhpd\t{$src, $dst|$dst, $src}",
755 [(store (f64 (extractelt
756 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
757 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
// Legacy-encoded (non-VEX) counterparts of the two stores above.
759 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
760 "movhps\t{$src, $dst|$dst, $src}",
761 [(store (f64 (extractelt
762 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
763 (bc_v2f64 (v4f32 VR128:$src))),
764 (iPTR 0))), addr:$dst)]>;
765 def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
766 "movhpd\t{$src, $dst|$dst, $src}",
767 [(store (f64 (extractelt
768 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
769 (iPTR 0))), addr:$dst)]>;
// Extra selection patterns that fold loads/shuffles into MOVHPD/MOVHPS,
// split by encoding availability (AVX vs. SSE1-only vs. SSE2).
772 let Predicates = [UseAVX] in {
773 // Also handle an i64 load because that may get selected as a faster way to
// ... load the 64 bits into the high half (comment continues on a line not
// visible here). Matches unpckl-with-scalar-load onto VMOVHPDrm.
775 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
776 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
777 (VMOVHPDrm VR128:$src1, addr:$src2)>;
// Store of element 1 (via vpermilpd imm=1 then extract element 0) is a
// high-half store, i.e. VMOVHPDmr.
779 def : Pat<(store (f64 (extractelt
780 (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
781 (iPTR 0))), addr:$dst),
782 (VMOVHPDmr addr:$dst, VR128:$src)>;
785 let Predicates = [UseSSE1] in {
786 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
787 // end up with a movsd or blend instead of shufp.
788 // No need for aligned load, we're only loading 64-bits.
789 def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)),
790 (MOVHPSrm VR128:$src1, addr:$src2)>;
793 let Predicates = [UseSSE2] in {
796 // Also handle an i64 load because that may get selected as a faster way to
// Non-AVX analogues of the UseAVX patterns above.
798 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
799 (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
800 (MOVHPDrm VR128:$src1, addr:$src2)>;
// SSE2 uses shufpd imm=1 (rather than vpermilpd) to move the high element
// down before the store; still selects the plain MOVHPDmr.
802 def : Pat<(store (f64 (extractelt
803 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
804 (iPTR 0))), addr:$dst),
805 (MOVHPDmr addr:$dst, VR128:$src)>;
808 //===----------------------------------------------------------------------===//
809 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
810 //===----------------------------------------------------------------------===//
812 let Predicates = [UseAVX] in {
// VEX three-operand form: dst gets low half of $src1 and low half of $src2
// moved to the high half (X86Movlhps node).
813 def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
814 (ins VR128:$src1, VR128:$src2),
815 "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
817 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
818 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
819 let isCommutable = 1 in
// MOVHLPS: high half of $src2 into the low half of dst.
820 def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
821 (ins VR128:$src1, VR128:$src2),
822 "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
824 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
825 VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
// Legacy two-operand encodings: $src1 is tied to $dst.
828 let Constraints = "$src1 = $dst" in {
829 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
830 (ins VR128:$src1, VR128:$src2),
831 "movlhps\t{$src2, $dst|$dst, $src2}",
833 (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
834 Sched<[SchedWriteFShuffle.XMM]>;
835 let isCommutable = 1 in
836 def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
837 (ins VR128:$src1, VR128:$src2),
838 "movhlps\t{$src2, $dst|$dst, $src2}",
840 (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
841 Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
844 //===----------------------------------------------------------------------===//
845 // SSE 1 & 2 - Conversion Instructions
846 //===----------------------------------------------------------------------===//
// Scalar FP<->int conversion skeleton: emits a reg-reg form and a reg-mem
// form driven by the supplied SDNode and load fragment.
848 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
849 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
850 string asm, X86FoldableSchedWrite sched> {
851 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
852 [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
853 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
854 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
855 Sched<[sched.Folded]>;
// Packed sint_to_fp conversion skeleton (e.g. cvtdq2ps); hasSideEffects = 0
// because the rr/rm pair carries its patterns explicitly.
859 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
860 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
861 string asm, Domain d, X86FoldableSchedWrite sched> {
862 let hasSideEffects = 0 in {
863 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
864 [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
867 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
868 [(set RC:$dst, (DstTy (sint_to_fp
869 (SrcTy (ld_frag addr:$src)))))], d>,
870 Sched<[sched.Folded]>;
// AVX three-operand scalar int->fp conversion skeleton. Pattern-less ([]):
// these are selected via the explicit Pat<>s below, not by these defs.
874 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
875 X86MemOperand x86memop, string asm,
876 X86FoldableSchedWrite sched> {
877 let hasSideEffects = 0, Predicates = [UseAVX] in {
878 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
879 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
882 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
883 (ins DstRC:$src1, x86memop:$src),
884 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
885 Sched<[sched.Folded, sched.ReadAfterFold]>;
886 } // hasSideEffects = 0
// AVX (VEX-encoded) scalar truncating fp->int conversions plus the AT&T
// {l}/{q} assembler aliases and the int->fp selection patterns.
889 let Predicates = [UseAVX] in {
890 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
891 "cvttss2si\t{$src, $dst|$dst, $src}",
894 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
895 "cvttss2si\t{$src, $dst|$dst, $src}",
897 XS, VEX, VEX_W, VEX_LIG;
898 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
899 "cvttsd2si\t{$src, $dst|$dst, $src}",
902 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
903 "cvttsd2si\t{$src, $dst|$dst, $src}",
905 XD, VEX, VEX_W, VEX_LIG;
// AT&T-syntax aliases carrying explicit {l}/{q} operand-size suffixes.
907 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
908 (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
909 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
910 (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
911 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
912 (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
913 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
914 (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
915 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
916 (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
917 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
918 (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
919 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
920 (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
921 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
922 (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
924 // The assembler can recognize rr 64-bit instructions by seeing a rxx
925 // register, but the same isn't true when only using memory operands,
926 // provide other assembly "l" and "q" forms to address this explicitly
927 // where appropriate to do so.
928 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
929 WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
930 defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
931 WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
932 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
933 WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
934 defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
935 WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
937 let Predicates = [UseAVX] in {
938 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
939 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
940 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
941 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">;
// Selection patterns for the pattern-less sse12_vcvt_avx defs; the tied
// first operand is fed an IMPLICIT_DEF since only the low element matters.
943 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
944 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
945 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
946 (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
947 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
948 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
949 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
950 (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
952 def : Pat<(f32 (sint_to_fp GR32:$src)),
953 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
954 def : Pat<(f32 (sint_to_fp GR64:$src)),
955 (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
956 def : Pat<(f64 (sint_to_fp GR32:$src)),
957 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
958 def : Pat<(f64 (sint_to_fp GR64:$src)),
959 (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
// Legacy (non-VEX) SSE scalar conversions; mirrors the AVX defs above with
// REX_W selecting the 64-bit GPR variants.
962 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
963 "cvttss2si\t{$src, $dst|$dst, $src}",
965 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
966 "cvttss2si\t{$src, $dst|$dst, $src}",
967 WriteCvtSS2I>, XS, REX_W;
968 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
969 "cvttsd2si\t{$src, $dst|$dst, $src}",
971 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
972 "cvttsd2si\t{$src, $dst|$dst, $src}",
973 WriteCvtSD2I>, XD, REX_W;
974 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
975 "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
977 defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
978 "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
979 WriteCvtI2SS>, XS, REX_W;
980 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
981 "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
983 defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
984 "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
985 WriteCvtI2SD>, XD, REX_W;
// AT&T {l}/{q} aliases for the legacy encodings, matching the VEX set.
987 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
988 (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">;
989 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
990 (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">;
991 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
992 (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">;
993 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
994 (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">;
995 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
996 (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">;
997 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
998 (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">;
999 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1000 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">;
1001 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1002 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">;
1004 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1005 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">;
1006 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1007 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">;
1009 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
1010 // and/or XMM operand(s).
// Intrinsic-form conversion skeleton: takes a whole vector register (SrcVT)
// and converts the scalar element via OpNode; mem form uses a ComplexPattern
// so partial-vector loads match.
1012 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1013 ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1014 Operand memop, ComplexPattern mem_cpat, string asm,
1015 X86FoldableSchedWrite sched> {
1016 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1017 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1018 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1020 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1021 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1022 [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
1023 Sched<[sched.Folded]>;
// Three-address (tied or VEX) intrinsic-form skeleton; pattern-less, so
// selection happens via the explicit _Int Pat<>s later in this file.
1026 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1027 RegisterClass DstRC, X86MemOperand x86memop,
1028 string asm, X86FoldableSchedWrite sched,
1030 let hasSideEffects = 0 in {
1031 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1033 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1034 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1035 []>, Sched<[sched]>;
1037 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1038 (ins DstRC:$src1, x86memop:$src2),
1040 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1041 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1042 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations of the intrinsic-form conversions: cvtsd2si (rounding) and
// the cvtsi2ss/sd 3-address forms, in AVX and legacy encodings.
1046 let Predicates = [UseAVX] in {
1047 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1048 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1049 WriteCvtSD2I>, XD, VEX, VEX_LIG;
1050 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1051 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1052 WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
1054 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1055 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
1056 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1057 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
// isCodeGenOnly: these duplicate assembly strings of the FR32/FR64 defs and
// exist only for intrinsic selection, not for the assembler.
1060 let isCodeGenOnly = 1 in {
1061 let Predicates = [UseAVX] in {
1062 defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1063 i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V;
1064 defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1065 i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W;
1066 defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1067 i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V;
1068 defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1069 i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W;
1071 let Constraints = "$src1 = $dst" in {
1072 defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1073 i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS;
1074 defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1075 i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W;
1076 defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1077 i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD;
1078 defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1079 i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W;
1081 } // isCodeGenOnly = 1
1085 // Aliases for intrinsics
// Truncating (cvtt*) intrinsic-form conversions via X86cvtts2Int; codegen
// only — the user-visible encodings are the FR32/FR64 defs earlier.
1086 let isCodeGenOnly = 1 in {
1087 let Predicates = [UseAVX] in {
1088 defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1089 ssmem, sse_load_f32, "cvttss2si",
1090 WriteCvtSS2I>, XS, VEX;
1091 defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1092 X86cvtts2Int, ssmem, sse_load_f32,
1093 "cvttss2si", WriteCvtSS2I>,
// NOTE(review): the SD variants below pass WriteCvtSS2I rather than
// WriteCvtSD2I — possibly intentional, but worth confirming upstream.
1095 defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1096 sdmem, sse_load_f64, "cvttsd2si",
1097 WriteCvtSS2I>, XD, VEX;
1098 defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1099 X86cvtts2Int, sdmem, sse_load_f64,
1100 "cvttsd2si", WriteCvtSS2I>,
1103 defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1104 ssmem, sse_load_f32, "cvttss2si",
1106 defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1107 X86cvtts2Int, ssmem, sse_load_f32,
1108 "cvttss2si", WriteCvtSS2I>, XS, REX_W;
1109 defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1110 sdmem, sse_load_f64, "cvttsd2si",
1112 defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1113 X86cvtts2Int, sdmem, sse_load_f64,
1114 "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
1115 } // isCodeGenOnly = 1
// Rounding cvtss2si intrinsic forms, plus the packed int->float cvtdq2ps
// family (VEX 128/256-bit and legacy 128-bit).
1117 let Predicates = [UseAVX] in {
1118 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1119 ssmem, sse_load_f32, "cvtss2si",
1120 WriteCvtSS2I>, XS, VEX, VEX_LIG;
1121 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1122 ssmem, sse_load_f32, "cvtss2si",
1123 WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
1125 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1126 ssmem, sse_load_f32, "cvtss2si",
1128 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1129 ssmem, sse_load_f32, "cvtss2si",
1130 WriteCvtSS2I>, XS, REX_W;
1132 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1133 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1134 SSEPackedSingle, WriteCvtI2PS>,
1135 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1136 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1137 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1138 SSEPackedSingle, WriteCvtI2PSY>,
1139 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
// Legacy form requires SSE2 (the instruction is SSE2) and uses the
// alignment-checking memop fragment instead of unaligned `load`.
1141 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1142 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1143 SSEPackedSingle, WriteCvtI2PS>,
1144 PS, Requires<[UseSSE2]>;
// AT&T {l}/{q} assembler aliases for the intrinsic (_Int, XMM-source) forms
// of cvtss2si/cvtsd2si, VEX then legacy.
1146 let Predicates = [UseAVX] in {
1147 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1148 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1149 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1150 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1151 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1152 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1153 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1154 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1155 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1156 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1157 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1158 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1159 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1160 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1161 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1162 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1165 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1166 (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1167 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1168 (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1169 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1170 (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1171 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1172 (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1173 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1174 (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1175 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1176 (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1177 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1178 (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1179 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1180 (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1184 // Convert scalar double to scalar single
// FR-register forms: VEX defs are pattern-less and selected via the Pat<>
// below; legacy defs carry fpround patterns directly. The rm legacy form
// only folds the load under OptForSize.
1185 let hasSideEffects = 0, Predicates = [UseAVX] in {
1186 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1187 (ins FR32:$src1, FR64:$src2),
1188 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1189 VEX_4V, VEX_LIG, VEX_WIG,
1190 Sched<[WriteCvtSD2SS]>;
1192 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1193 (ins FR32:$src1, f64mem:$src2),
1194 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1195 XD, VEX_4V, VEX_LIG, VEX_WIG,
1196 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1199 def : Pat<(f32 (fpround FR64:$src)),
1200 (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1203 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1204 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1205 [(set FR32:$dst, (fpround FR64:$src))]>,
1206 Sched<[WriteCvtSD2SS]>;
1207 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1208 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1209 [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
1210 XD, Requires<[UseSSE2, OptForSize]>,
1211 Sched<[WriteCvtSD2SS.Folded]>;
// Intrinsic (whole-XMM) forms, selected directly from the clang builtin.
1213 let isCodeGenOnly = 1 in {
1214 def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1215 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1216 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1218 (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
1219 XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1220 Sched<[WriteCvtSD2SS]>;
1221 def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1222 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1223 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1224 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1225 VR128:$src1, sse_load_f64:$src2))]>,
1226 XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1227 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1228 let Constraints = "$src1 = $dst" in {
1229 def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1230 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1231 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1233 (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>,
1234 XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1235 def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1236 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1237 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1238 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1239 VR128:$src1, sse_load_f64:$src2))]>,
1240 XD, Requires<[UseSSE2]>,
1241 Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1243 } // isCodeGenOnly = 1
1245 // Convert scalar single to scalar double
1246 // SSE2 instructions with XS prefix
// Same layering as cvtsd2ss above: pattern-less VEX defs + explicit Pat<>s,
// legacy defs with fpextend/extloadf32 patterns, then intrinsic forms.
1247 let hasSideEffects = 0 in {
1248 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1249 (ins FR64:$src1, FR32:$src2),
1250 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1251 XS, VEX_4V, VEX_LIG, VEX_WIG,
1252 Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
1254 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1255 (ins FR64:$src1, f32mem:$src2),
1256 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1257 XS, VEX_4V, VEX_LIG, VEX_WIG,
1258 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1259 Requires<[UseAVX, OptForSize]>;
1262 def : Pat<(f64 (fpextend FR32:$src)),
1263 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1264 def : Pat<(fpextend (loadf32 addr:$src)),
1265 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
// OptForSize folds the load; OptForSpeed keeps the load separate (VMOVSSrm)
// to avoid the partial-register/false-dependency cost of the folded form.
1267 def : Pat<(extloadf32 addr:$src),
1268 (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
1269 Requires<[UseAVX, OptForSize]>;
1270 def : Pat<(extloadf32 addr:$src),
1271 (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
1272 Requires<[UseAVX, OptForSpeed]>;
1274 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1275 "cvtss2sd\t{$src, $dst|$dst, $src}",
1276 [(set FR64:$dst, (fpextend FR32:$src))]>,
1277 XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
1278 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1279 "cvtss2sd\t{$src, $dst|$dst, $src}",
1280 [(set FR64:$dst, (extloadf32 addr:$src))]>,
1281 XS, Requires<[UseSSE2, OptForSize]>,
1282 Sched<[WriteCvtSS2SD.Folded]>;
1284 // extload f32 -> f64. This matches load+fpextend because we have a hack in
1285 // the isel (PreprocessForFPConvert) that can introduce loads after dag
1287 // Since these loads aren't folded into the fpextend, we have to match it
1289 def : Pat<(fpextend (loadf32 addr:$src)),
1290 (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>;
1291 def : Pat<(extloadf32 addr:$src),
1292 (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
// Intrinsic forms are pattern-less here ([], hasSideEffects = 0); the
// X86Movsd/X86Movss fusion patterns later in the file select them.
1294 let isCodeGenOnly = 1, hasSideEffects = 0 in {
1295 def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1296 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1297 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1298 []>, XS, VEX_4V, VEX_WIG,
1299 Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1301 def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1302 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1303 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1304 []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>,
1305 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1306 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1307 def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1308 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1309 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1310 []>, XS, Requires<[UseSSE2]>,
1311 Sched<[WriteCvtSS2SD]>;
1313 def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1314 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1315 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1316 []>, XS, Requires<[UseSSE2]>,
1317 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1319 } // isCodeGenOnly = 1
1321 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1322 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1323 // vmovs{s,d} instructions
// Each pattern fuses "convert scalar, then movss/movsd it into a vector"
// into one _Int instruction that writes the low element in place. Note the
// inner movss/movsd operand lines are elided in this excerpt.
1324 let Predicates = [UseAVX] in {
1325 def : Pat<(v4f32 (X86Movss
1327 (v4f32 (scalar_to_vector
1328 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1329 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1331 def : Pat<(v2f64 (X86Movsd
1333 (v2f64 (scalar_to_vector
1334 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1335 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1337 def : Pat<(v4f32 (X86Movss
1339 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1340 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1342 def : Pat<(v4f32 (X86Movss
1344 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1345 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1347 def : Pat<(v4f32 (X86Movss
1349 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1350 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1352 def : Pat<(v4f32 (X86Movss
1354 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1355 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1357 def : Pat<(v2f64 (X86Movsd
1359 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1360 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1362 def : Pat<(v2f64 (X86Movsd
1364 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1365 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1367 def : Pat<(v2f64 (X86Movsd
1369 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1370 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1372 def : Pat<(v2f64 (X86Movsd
1374 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1375 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1376 } // Predicates = [UseAVX]
// Legacy-encoding equivalents: the f64-destination set needs SSE2, the
// f32-destination set only SSE1.
1378 let Predicates = [UseSSE2] in {
1379 def : Pat<(v4f32 (X86Movss
1381 (v4f32 (scalar_to_vector
1382 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1383 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1385 def : Pat<(v2f64 (X86Movsd
1387 (v2f64 (scalar_to_vector
1388 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1389 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1391 def : Pat<(v2f64 (X86Movsd
1393 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1394 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1396 def : Pat<(v2f64 (X86Movsd
1398 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
1399 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1401 def : Pat<(v2f64 (X86Movsd
1403 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1404 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1406 def : Pat<(v2f64 (X86Movsd
1408 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
1409 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1410 } // Predicates = [UseSSE2]
1412 let Predicates = [UseSSE1] in {
1413 def : Pat<(v4f32 (X86Movss
1415 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1416 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1418 def : Pat<(v4f32 (X86Movss
1420 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
1421 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1423 def : Pat<(v4f32 (X86Movss
1425 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1426 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1428 def : Pat<(v4f32 (X86Movss
1430 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
1431 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1432 } // Predicates = [UseSSE1]
// Packed float->int with current rounding mode (X86cvtp2Int): VEX 128/256
// forms, then the legacy 128-bit form using alignment-checked memop.
1434 let Predicates = [HasAVX, NoVLX] in {
1435 // Convert packed single/double fp to doubleword
1436 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1437 "cvtps2dq\t{$src, $dst|$dst, $src}",
1438 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1439 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1440 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1441 "cvtps2dq\t{$src, $dst|$dst, $src}",
1443 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1444 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1445 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1446 "cvtps2dq\t{$src, $dst|$dst, $src}",
1448 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1449 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1450 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1451 "cvtps2dq\t{$src, $dst|$dst, $src}",
1453 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1454 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1456 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1457 "cvtps2dq\t{$src, $dst|$dst, $src}",
1458 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1459 Sched<[WriteCvtPS2I]>;
1460 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1461 "cvtps2dq\t{$src, $dst|$dst, $src}",
1463 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1464 Sched<[WriteCvtPS2ILd]>;
1467 // Convert Packed Double FP to Packed DW Integers
1468 let Predicates = [HasAVX, NoVLX] in {
1469 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1470 // register, but the same isn't true when using memory operands instead.
1471 // Provide other assembly rr and rm forms to address this explicitly.
1472 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1473 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1475 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1476 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// "x"/"y" suffixed aliases disambiguate the 128- vs 256-bit memory forms.
1479 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1480 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
1481 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1482 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1484 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1485 Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1486 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1487 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
// 256-bit source narrows to a 128-bit integer result (4 doubles -> 4 dwords).
1490 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1491 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1493 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1494 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1495 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1496 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1498 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1499 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1500 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1501 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
1502 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1503 (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
1506 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1507 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1509 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1510 Sched<[WriteCvtPD2ILd]>;
1511 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1512 "cvtpd2dq\t{$src, $dst|$dst, $src}",
1514 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1515 Sched<[WriteCvtPD2I]>;
1517 // Convert with truncation packed single/double fp to doubleword
1518 // SSE2 packed instructions with XS prefix
// Truncating (round-toward-zero) packed conversions via X86cvttp2si, plus
// fp_to_sint selection patterns mapping the generic node onto them.
1519 let Predicates = [HasAVX, NoVLX] in {
1520 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1521 "cvttps2dq\t{$src, $dst|$dst, $src}",
1523 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1524 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1525 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1526 "cvttps2dq\t{$src, $dst|$dst, $src}",
1528 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
1529 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1530 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1531 "cvttps2dq\t{$src, $dst|$dst, $src}",
1533 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
1534 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1535 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1536 "cvttps2dq\t{$src, $dst|$dst, $src}",
1538 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
1540 Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
// fp_to_sint has truncating semantics, so it maps 1:1 onto cvttps2dq.
1543 let Predicates = [HasAVX, NoVLX] in {
1544 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
1545 (VCVTTPS2DQrr VR128:$src)>;
1546 def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
1547 (VCVTTPS2DQrm addr:$src)>;
1548 def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
1549 (VCVTTPS2DQYrr VR256:$src)>;
1550 def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
1551 (VCVTTPS2DQYrm addr:$src)>;
1554 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1555 "cvttps2dq\t{$src, $dst|$dst, $src}",
1557 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
1558 Sched<[WriteCvtPS2I]>;
1559 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1560 "cvttps2dq\t{$src, $dst|$dst, $src}",
1562 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
1563 Sched<[WriteCvtPS2ILd]>;
1565 let Predicates = [UseSSE2] in {
1566 def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
1567 (CVTTPS2DQrr VR128:$src)>;
1568 def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
1569 (CVTTPS2DQrm addr:$src)>;
1572 let Predicates = [HasAVX, NoVLX] in
1573 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1574 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1576 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1577 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1579 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1580 // register, but the same isn't true when using memory operands instead.
1581 // Provide other assembly rr and rm forms to address this explicitly.
1584 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1585 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
1587 let Predicates = [HasAVX, NoVLX] in
1588 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1589 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1591 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
1592 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1593 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1594 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">;
1597 let Predicates = [HasAVX, NoVLX] in {
1598 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1599 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1601 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
1602 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1603 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1604 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1606 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
1607 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1609 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1610 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
1611 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1612 (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">;
1614 let Predicates = [HasAVX, NoVLX] in {
1615 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
1616 (VCVTTPD2DQYrr VR256:$src)>;
1617 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
1618 (VCVTTPD2DQYrm addr:$src)>;
1621 let Predicates = [HasAVX, NoVLX] in {
1622 def : Pat<(X86vzmovl (v2i64 (bitconvert
1623 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
1624 (VCVTPD2DQrr VR128:$src)>;
1625 def : Pat<(X86vzmovl (v2i64 (bitconvert
1626 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
1627 (VCVTPD2DQrm addr:$src)>;
1628 def : Pat<(X86vzmovl (v2i64 (bitconvert
1629 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
1630 (VCVTTPD2DQrr VR128:$src)>;
1631 def : Pat<(X86vzmovl (v2i64 (bitconvert
1632 (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
1633 (VCVTTPD2DQrm addr:$src)>;
1634 } // Predicates = [HasAVX, NoVLX]
1636 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1637 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1639 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
1640 Sched<[WriteCvtPD2I]>;
1641 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1642 "cvttpd2dq\t{$src, $dst|$dst, $src}",
1644 (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
1645 Sched<[WriteCvtPD2ILd]>;
1647 let Predicates = [UseSSE2] in {
1648 def : Pat<(X86vzmovl (v2i64 (bitconvert
1649 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
1650 (CVTPD2DQrr VR128:$src)>;
1651 def : Pat<(X86vzmovl (v2i64 (bitconvert
1652 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
1653 (CVTPD2DQrm addr:$src)>;
1654 def : Pat<(X86vzmovl (v2i64 (bitconvert
1655 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
1656 (CVTTPD2DQrr VR128:$src)>;
1657 def : Pat<(X86vzmovl (v2i64 (bitconvert
1658 (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
1659 (CVTTPD2DQrm addr:$src)>;
1660 } // Predicates = [UseSSE2]
1662 // Convert packed single to packed double
1663 let Predicates = [HasAVX, NoVLX] in {
1664 // SSE2 instructions without OpSize prefix
1665 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1666 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1667 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1668 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1669 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1670 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1671 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1672 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1673 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1674 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1675 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
1676 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1677 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1678 "vcvtps2pd\t{$src, $dst|$dst, $src}",
1679 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1680 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1683 let Predicates = [UseSSE2] in {
1684 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1685 "cvtps2pd\t{$src, $dst|$dst, $src}",
1686 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
1687 PS, Sched<[WriteCvtPS2PD]>;
1688 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1689 "cvtps2pd\t{$src, $dst|$dst, $src}",
1690 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1691 PS, Sched<[WriteCvtPS2PD.Folded]>;
1694 // Convert Packed DW Integers to Packed Double FP
1695 let Predicates = [HasAVX, NoVLX] in {
1696 let hasSideEffects = 0, mayLoad = 1 in
1697 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1698 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1700 (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
1701 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1702 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1703 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1705 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1706 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1707 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1708 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1710 (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
1711 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1713 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1714 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1716 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
1717 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1720 let hasSideEffects = 0, mayLoad = 1 in
1721 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1722 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1724 (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
1725 Sched<[WriteCvtI2PDLd]>;
1726 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1727 "cvtdq2pd\t{$src, $dst|$dst, $src}",
1729 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
1730 Sched<[WriteCvtI2PD]>;
1732 // AVX register conversion intrinsics
1733 let Predicates = [HasAVX, NoVLX] in {
1734 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
1735 (VCVTDQ2PDrm addr:$src)>;
1736 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
1737 (VCVTDQ2PDrm addr:$src)>;
1738 } // Predicates = [HasAVX, NoVLX]
1740 // SSE2 register conversion intrinsics
1741 let Predicates = [UseSSE2] in {
1742 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
1743 (CVTDQ2PDrm addr:$src)>;
1744 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
1745 (CVTDQ2PDrm addr:$src)>;
1746 } // Predicates = [UseSSE2]
1748 // Convert packed double to packed single
1749 // The assembler can recognize rr 256-bit instructions by seeing a ymm
1750 // register, but the same isn't true when using memory operands instead.
1751 // Provide other assembly rr and rm forms to address this explicitly.
1752 let Predicates = [HasAVX, NoVLX] in
1753 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1754 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1755 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1756 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1759 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1760 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
1761 let Predicates = [HasAVX, NoVLX] in
1762 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1763 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1764 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
1765 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1766 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1767 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">;
1770 let Predicates = [HasAVX, NoVLX] in {
1771 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1772 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1773 [(set VR128:$dst, (fpround VR256:$src))]>,
1774 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1775 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1776 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1777 [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>,
1778 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1780 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1781 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
1782 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1783 (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">;
1785 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1786 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1787 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
1788 Sched<[WriteCvtPD2PS]>;
1789 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1790 "cvtpd2ps\t{$src, $dst|$dst, $src}",
1791 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
1792 Sched<[WriteCvtPD2PS.Folded]>;
1794 // AVX 256-bit register conversion intrinsics
1795 // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
1796 // whenever possible to avoid declaring two versions of each one.
1798 let Predicates = [HasAVX, NoVLX] in {
1799 // Match fpround and fpextend for 128/256-bit conversions
1800 def : Pat<(X86vzmovl (v2f64 (bitconvert
1801 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
1802 (VCVTPD2PSrr VR128:$src)>;
1803 def : Pat<(X86vzmovl (v2f64 (bitconvert
1804 (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
1805 (VCVTPD2PSrm addr:$src)>;
1808 let Predicates = [UseSSE2] in {
1809 // Match fpround and fpextend for 128 conversions
1810 def : Pat<(X86vzmovl (v2f64 (bitconvert
1811 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
1812 (CVTPD2PSrr VR128:$src)>;
1813 def : Pat<(X86vzmovl (v2f64 (bitconvert
1814 (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
1815 (CVTPD2PSrm addr:$src)>;
1818 //===----------------------------------------------------------------------===//
1819 // SSE 1 & 2 - Compare Instructions
1820 //===----------------------------------------------------------------------===//
1822 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
1823 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1824 Operand CC, SDNode OpNode, ValueType VT,
1825 PatFrag ld_frag, string asm, string asm_alt,
1826 X86FoldableSchedWrite sched> {
1827 let isCommutable = 1 in
1828 def rr : SIi8<0xC2, MRMSrcReg,
1829 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
1830 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
1832 def rm : SIi8<0xC2, MRMSrcMem,
1833 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
1834 [(set RC:$dst, (OpNode (VT RC:$src1),
1835 (ld_frag addr:$src2), imm:$cc))]>,
1836 Sched<[sched.Folded, sched.ReadAfterFold]>;
1838 // Accept explicit immediate argument form instead of comparison code.
1839 let isAsmParserOnly = 1, hasSideEffects = 0 in {
1840 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
1841 (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>,
1842 Sched<[sched]>, NotMemoryFoldable;
1844 def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
1845 (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>,
1846 Sched<[sched.Folded, sched.ReadAfterFold]>, NotMemoryFoldable;
1850 let ExeDomain = SSEPackedSingle in
1851 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
1852 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1853 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1854 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1855 let ExeDomain = SSEPackedDouble in
1856 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
1857 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1858 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1859 SchedWriteFCmpSizes.PD.Scl>,
1860 XD, VEX_4V, VEX_LIG, VEX_WIG;
1862 let Constraints = "$src1 = $dst" in {
1863 let ExeDomain = SSEPackedSingle in
1864 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
1865 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
1866 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1867 SchedWriteFCmpSizes.PS.Scl>, XS;
1868 let ExeDomain = SSEPackedDouble in
1869 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
1870 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
1871 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1872 SchedWriteFCmpSizes.PD.Scl>, XD;
1875 multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
1876 Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1877 ComplexPattern mem_cpat> {
1878 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1879 (ins VR128:$src1, VR128:$src, CC:$cc), asm,
1880 [(set VR128:$dst, (Int VR128:$src1,
1881 VR128:$src, imm:$cc))]>,
1884 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1885 (ins VR128:$src1, memop:$src, CC:$cc), asm,
1886 [(set VR128:$dst, (Int VR128:$src1,
1887 mem_cpat:$src, imm:$cc))]>,
1888 Sched<[sched.Folded, sched.ReadAfterFold]>;
1891 let isCodeGenOnly = 1 in {
1892 // Aliases to match intrinsics which expect XMM operand(s).
1893 let ExeDomain = SSEPackedSingle in
1894 defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
1895 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1896 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V;
1897 let ExeDomain = SSEPackedDouble in
1898 defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
1899 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1900 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1902 let Constraints = "$src1 = $dst" in {
1903 let ExeDomain = SSEPackedSingle in
1904 defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
1905 "cmp${cc}ss\t{$src, $dst|$dst, $src}",
1906 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1907 let ExeDomain = SSEPackedDouble in
1908 defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
1909 "cmp${cc}sd\t{$src, $dst|$dst, $src}",
1910 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1915 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
1916 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1917 ValueType vt, X86MemOperand x86memop,
1918 PatFrag ld_frag, string OpcodeStr,
1919 X86FoldableSchedWrite sched> {
1920 let hasSideEffects = 0 in {
1921 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1922 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1923 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1926 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1927 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1928 [(set EFLAGS, (OpNode (vt RC:$src1),
1929 (ld_frag addr:$src2)))]>,
1930 Sched<[sched.Folded, sched.ReadAfterFold]>;
1934 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
1935 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1936 ValueType vt, Operand memop,
1937 ComplexPattern mem_cpat, string OpcodeStr,
1938 X86FoldableSchedWrite sched> {
1939 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1940 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1941 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1944 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1945 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1946 [(set EFLAGS, (OpNode (vt RC:$src1),
1948 Sched<[sched.Folded, sched.ReadAfterFold]>;
1951 let Defs = [EFLAGS] in {
1952 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1953 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1954 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1955 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1956 let Pattern = []<dag> in {
1957 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1958 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
1959 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1960 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
1963 let isCodeGenOnly = 1 in {
1964 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1965 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG;
1966 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1967 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG;
1969 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1970 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG;
1971 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1972 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG;
1974 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
1975 "ucomiss", WriteFCom>, PS;
1976 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
1977 "ucomisd", WriteFCom>, PD;
1979 let Pattern = []<dag> in {
1980 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
1981 "comiss", WriteFCom>, PS;
1982 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
1983 "comisd", WriteFCom>, PD;
1986 let isCodeGenOnly = 1 in {
1987 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1988 sse_load_f32, "ucomiss", WriteFCom>, PS;
1989 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1990 sse_load_f64, "ucomisd", WriteFCom>, PD;
1992 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1993 sse_load_f32, "comiss", WriteFCom>, PS;
1994 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1995 sse_load_f64, "comisd", WriteFCom>, PD;
1997 } // Defs = [EFLAGS]
1999 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
2000 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
2001 Operand CC, ValueType VT, string asm,
2002 string asm_alt, X86FoldableSchedWrite sched,
2003 Domain d, PatFrag ld_frag> {
2004 let isCommutable = 1 in
2005 def rri : PIi8<0xC2, MRMSrcReg,
2006 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
2007 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
2009 def rmi : PIi8<0xC2, MRMSrcMem,
2010 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
2012 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
2013 Sched<[sched.Folded, sched.ReadAfterFold]>;
2015 // Accept explicit immediate argument form instead of comparison code.
2016 let isAsmParserOnly = 1, hasSideEffects = 0 in {
2017 def rri_alt : PIi8<0xC2, MRMSrcReg,
2018 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
2019 asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable;
2021 def rmi_alt : PIi8<0xC2, MRMSrcMem,
2022 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
2023 asm_alt, [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>,
2028 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
2029 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2030 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2031 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
2032 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
2033 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2034 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2035 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
2036 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
2037 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2038 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2039 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
2040 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
2041 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2042 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2043 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
2044 let Constraints = "$src1 = $dst" in {
2045 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
2046 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
2047 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2048 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
2049 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
2050 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
2051 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2052 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
2055 def CommutableCMPCC : PatLeaf<(imm), [{
2056 uint64_t Imm = N->getZExtValue() & 0x7;
2057 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
2060 // Patterns to select compares with loads in first operand.
2061 let Predicates = [HasAVX] in {
2062 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
2063 CommutableCMPCC:$cc)),
2064 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
2066 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
2067 CommutableCMPCC:$cc)),
2068 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
2070 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
2071 CommutableCMPCC:$cc)),
2072 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
2074 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
2075 CommutableCMPCC:$cc)),
2076 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
2078 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2079 CommutableCMPCC:$cc)),
2080 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
2082 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2083 CommutableCMPCC:$cc)),
2084 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
2087 let Predicates = [UseSSE2] in {
2088 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
2089 CommutableCMPCC:$cc)),
2090 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
2092 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2093 CommutableCMPCC:$cc)),
2094 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
2097 let Predicates = [UseSSE1] in {
2098 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
2099 CommutableCMPCC:$cc)),
2100 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
2102 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2103 CommutableCMPCC:$cc)),
2104 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
2107 //===----------------------------------------------------------------------===//
2108 // SSE 1 & 2 - Shuffle Instructions
2109 //===----------------------------------------------------------------------===//
2111 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2112 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2113 ValueType vt, string asm, PatFrag mem_frag,
2114 X86FoldableSchedWrite sched, Domain d> {
2115 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2116 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2117 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2118 (i8 imm:$src3))))], d>,
2119 Sched<[sched.Folded, sched.ReadAfterFold]>;
2120 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2121 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2122 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2123 (i8 imm:$src3))))], d>,
2127 let Predicates = [HasAVX, NoVLX] in {
2128 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2129 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2130 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2131 PS, VEX_4V, VEX_WIG;
2132 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2133 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2134 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2135 PS, VEX_4V, VEX_L, VEX_WIG;
2136 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2137 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2138 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2139 PD, VEX_4V, VEX_WIG;
2140 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2141 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2142 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2143 PD, VEX_4V, VEX_L, VEX_WIG;
2145 let Constraints = "$src1 = $dst" in {
2146 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2147 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2148 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2149 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2150 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2151 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2154 //===----------------------------------------------------------------------===//
2155 // SSE 1 & 2 - Unpack FP Instructions
2156 //===----------------------------------------------------------------------===//
2158 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
2159 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2160 PatFrag mem_frag, RegisterClass RC,
2161 X86MemOperand x86memop, string asm,
2162 X86FoldableSchedWrite sched, Domain d,
2163 bit IsCommutable = 0> {
2164 let isCommutable = IsCommutable in
2165 def rr : PI<opc, MRMSrcReg,
2166 (outs RC:$dst), (ins RC:$src1, RC:$src2),
2168 (vt (OpNode RC:$src1, RC:$src2)))], d>,
2170 def rm : PI<opc, MRMSrcMem,
2171 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2173 (vt (OpNode RC:$src1,
2174 (mem_frag addr:$src2))))], d>,
2175 Sched<[sched.Folded, sched.ReadAfterFold]>;
2178 let Predicates = [HasAVX, NoVLX] in {
2179 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2180 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2181 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2182 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2183 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2184 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2185 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2186 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2187 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2188 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2189 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2190 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2192 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2193 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2194 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2195 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2196 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2197 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2198 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2199 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2200 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2201 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2202 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2203 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2204 }// Predicates = [HasAVX, NoVLX]
2206 let Constraints = "$src1 = $dst" in {
2207 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2208 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2209 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2210 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2211 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2212 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2213 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2214 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2215 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2216 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2217 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2218 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2219 } // Constraints = "$src1 = $dst"
2221 let Predicates = [HasAVX1Only] in {
2222 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2223 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2224 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2225 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2226 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2227 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2228 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2229 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2231 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2232 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2233 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2234 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2235 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2236 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2237 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2238 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2241 //===----------------------------------------------------------------------===//
2242 // SSE 1 & 2 - Extract Floating-Point Sign mask
2243 //===----------------------------------------------------------------------===//
2245 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
2246 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2247 string asm, Domain d> {
2248 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2249 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2250 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2251 Sched<[WriteFMOVMSK]>;
2254 let Predicates = [HasAVX] in {
2255 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2256 SSEPackedSingle>, PS, VEX, VEX_WIG;
2257 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2258 SSEPackedDouble>, PD, VEX, VEX_WIG;
2259 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2260 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2261 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2262 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2264 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2265 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2266 (VMOVMSKPSrr VR128:$src)>;
2267 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2268 (VMOVMSKPDrr VR128:$src)>;
2269 def : Pat<(X86movmsk (v8i32 VR256:$src)),
2270 (VMOVMSKPSYrr VR256:$src)>;
2271 def : Pat<(X86movmsk (v4i64 VR256:$src)),
2272 (VMOVMSKPDYrr VR256:$src)>;
2275 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2276 SSEPackedSingle>, PS;
2277 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2278 SSEPackedDouble>, PD;
2280 let Predicates = [UseSSE2] in {
2281 // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2282 def : Pat<(X86movmsk (v4i32 VR128:$src)),
2283 (MOVMSKPSrr VR128:$src)>;
2284 def : Pat<(X86movmsk (v2i64 VR128:$src)),
2285 (MOVMSKPDrr VR128:$src)>;
2288 //===---------------------------------------------------------------------===//
2289 // SSE2 - Packed Integer Logical Instructions
2290 //===---------------------------------------------------------------------===//
2292 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2294 /// PDI_binop_rm - Simple SSE2 binary operator.
// Emits two forms: rr (reg, reg) and rm (reg, mem). The two asm strings are
// the 2-address (SSE, $src1 tied to $dst) and 3-address (AVX) AT&T variants;
// Is2Addr picks between them.
// NOTE(review): the `!if(Is2Addr, ...)` selector lines and the rr Sched<>
// suffix appear to have been dropped in this excerpt — confirm vs. upstream.
2295 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2296 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2297 X86MemOperand x86memop, X86FoldableSchedWrite sched,
2298 bit IsCommutable, bit Is2Addr> {
2299 let isCommutable = IsCommutable in
2300 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2301 (ins RC:$src1, RC:$src2),
2303 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2304 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2305 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
// Memory-operand form: folds the second operand load via memop_frag.
2307 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2308 (ins RC:$src1, x86memop:$src2),
2310 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2311 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2312 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2313 Sched<[sched.Folded, sched.ReadAfterFold]>;
2315 } // ExeDomain = SSEPackedInt
// PDI_binop_all - Instantiates PDI_binop_rm for all three encodings of one
// opcode: VEX 128-bit (V<NAME>), legacy SSE2 128-bit (<NAME>, destructive,
// hence the tied-operand constraint), and VEX 256-bit AVX2 (V<NAME>Y).
// NOTE(review): the trailing `Predicate prd>` parameter line of the header is
// missing from this excerpt; `prd` below gates the AVX/AVX2 variants.
2317 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2318 ValueType OpVT128, ValueType OpVT256,
2319 X86SchedWriteWidths sched, bit IsCommutable,
2321 let Predicates = [HasAVX, prd] in
2322 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2323 VR128, load, i128mem, sched.XMM,
2324 IsCommutable, 0>, VEX_4V, VEX_WIG;
2326 let Constraints = "$src1 = $dst" in
2327 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2328 memop, i128mem, sched.XMM, IsCommutable, 1>;
2330 let Predicates = [HasAVX2, prd] in
2331 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2332 OpVT256, VR256, load, i256mem, sched.YMM,
2333 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2336 // These are ordered here for pattern ordering requirements with the fp versions
// Packed integer bitwise logic, defined on v2i64/v4i64 as the canonical type.
// PANDN is not commutable (operands have distinct roles: ~a & b).
2338 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2339 SchedWriteVecLogic, 1, NoVLX>;
2340 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2341 SchedWriteVecLogic, 1, NoVLX>;
2342 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2343 SchedWriteVecLogic, 1, NoVLX>;
2344 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2345 SchedWriteVecLogic, 0, NoVLX>;
2347 //===----------------------------------------------------------------------===//
2348 // SSE 1 & 2 - Logical Instructions
2349 //===----------------------------------------------------------------------===//
2351 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2353 /// There are no patterns here because isel prefers integer versions for SSE2
2354 /// and later. There are SSE1 v4f32 patterns later.
// Instantiates the PS (single) and PD (double) forms for AVX 256-bit,
// AVX 128-bit, and (below, with tied operands) legacy SSE encodings.
2355 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2356 SDNode OpNode, X86SchedWriteWidths sched> {
2357 let Predicates = [HasAVX, NoVLX] in {
2358 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2359 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2360 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2362 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2363 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2364 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2366 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2367 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2368 [], [], 0>, PS, VEX_4V, VEX_WIG;
2370 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2371 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2372 [], [], 0>, PD, VEX_4V, VEX_WIG;
// Legacy SSE forms are destructive: $src1 is tied to $dst.
// NOTE(review): the trailing pattern/suffix arguments of the two defms below
// (and the closing braces) are missing from this excerpt — confirm upstream.
2375 let Constraints = "$src1 = $dst" in {
2376 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2377 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2380 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2381 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
// Packed FP bitwise logic: ANDPS/PD, ORPS/PD, XORPS/PD, ANDNPS/PD.
// ANDN computes ~src1 & src2 and is therefore not commutable.
2386 defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2387 defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2388 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2389 let isCommutable = 0 in
2390 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
// With AVX2 (but no VL extensions), select the 256-bit integer logic
// instructions (VPAND/VPOR/VPXOR/VPANDN) for every 256-bit integer element
// type; the instructions are bitwise, so element width is irrelevant.
// Register-register patterns first, then load-folding (rm) patterns.
2392 let Predicates = [HasAVX2, NoVLX] in {
2393 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2394 (VPANDYrr VR256:$src1, VR256:$src2)>;
2395 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2396 (VPANDYrr VR256:$src1, VR256:$src2)>;
2397 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2398 (VPANDYrr VR256:$src1, VR256:$src2)>;
2400 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2401 (VPORYrr VR256:$src1, VR256:$src2)>;
2402 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2403 (VPORYrr VR256:$src1, VR256:$src2)>;
2404 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2405 (VPORYrr VR256:$src1, VR256:$src2)>;
2407 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2408 (VPXORYrr VR256:$src1, VR256:$src2)>;
2409 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2410 (VPXORYrr VR256:$src1, VR256:$src2)>;
2411 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2412 (VPXORYrr VR256:$src1, VR256:$src2)>;
2414 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2415 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2416 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2417 (VPANDNYrr VR256:$src1, VR256:$src2)>;
2418 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2419 (VPANDNYrr VR256:$src1, VR256:$src2)>;
// Fold a (possibly unaligned under AVX) load of the second operand.
2421 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2422 (VPANDYrm VR256:$src1, addr:$src2)>;
2423 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2424 (VPANDYrm VR256:$src1, addr:$src2)>;
2425 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2426 (VPANDYrm VR256:$src1, addr:$src2)>;
2428 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2429 (VPORYrm VR256:$src1, addr:$src2)>;
2430 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2431 (VPORYrm VR256:$src1, addr:$src2)>;
2432 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2433 (VPORYrm VR256:$src1, addr:$src2)>;
2435 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2436 (VPXORYrm VR256:$src1, addr:$src2)>;
2437 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2438 (VPXORYrm VR256:$src1, addr:$src2)>;
2439 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2440 (VPXORYrm VR256:$src1, addr:$src2)>;
2442 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2443 (VPANDNYrm VR256:$src1, addr:$src2)>;
2444 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2445 (VPANDNYrm VR256:$src1, addr:$src2)>;
2446 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2447 (VPANDNYrm VR256:$src1, addr:$src2)>;
2450 // If only AVX1 is supported, we need to handle integer operations with
2451 // floating point instructions since the integer versions aren't available.
// AVX1 has no 256-bit integer ALU, so all 256-bit integer and/or/xor/andn
// are lowered to the single-precision FP logic ops (VANDPSY etc.), which are
// bit-exact for bitwise operations. Unlike the AVX2 block above, v4i64 is
// also covered here. rr patterns first, then load-folding rm patterns.
2452 let Predicates = [HasAVX1Only] in {
2453 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2454 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2455 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2456 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2457 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2458 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2459 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2460 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2462 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2463 (VORPSYrr VR256:$src1, VR256:$src2)>;
2464 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2465 (VORPSYrr VR256:$src1, VR256:$src2)>;
2466 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2467 (VORPSYrr VR256:$src1, VR256:$src2)>;
2468 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2469 (VORPSYrr VR256:$src1, VR256:$src2)>;
2471 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2472 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2473 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2474 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2475 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2476 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2477 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2478 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2480 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2481 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2482 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2483 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2484 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2485 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2486 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2487 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
// Load-folding forms of the same lowerings.
2489 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2490 (VANDPSYrm VR256:$src1, addr:$src2)>;
2491 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2492 (VANDPSYrm VR256:$src1, addr:$src2)>;
2493 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2494 (VANDPSYrm VR256:$src1, addr:$src2)>;
2495 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2496 (VANDPSYrm VR256:$src1, addr:$src2)>;
2498 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2499 (VORPSYrm VR256:$src1, addr:$src2)>;
2500 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2501 (VORPSYrm VR256:$src1, addr:$src2)>;
2502 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2503 (VORPSYrm VR256:$src1, addr:$src2)>;
2504 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2505 (VORPSYrm VR256:$src1, addr:$src2)>;
2507 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2508 (VXORPSYrm VR256:$src1, addr:$src2)>;
2509 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2510 (VXORPSYrm VR256:$src1, addr:$src2)>;
2511 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2512 (VXORPSYrm VR256:$src1, addr:$src2)>;
2513 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2514 (VXORPSYrm VR256:$src1, addr:$src2)>;
2516 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2517 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2518 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2519 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2520 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2521 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2522 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2523 (VANDNPSYrm VR256:$src1, addr:$src2)>;
// Scalar f32/f64 bitwise logic has no dedicated instruction, so it is done
// by copying the scalar into an XMM register and using the packed VEX ops.
// NOTE(review): each Pat<> here should end with an extraction of the scalar
// result back to FR32/FR64 (e.g. `FR64)>;`); those closing lines appear to be
// missing from this excerpt — confirm against upstream.
2526 let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
2527 // Use packed logical operations for scalar ops.
2528 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
2530 (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2531 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2533 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
2535 (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2536 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2538 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
2540 (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2541 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2543 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
2545 (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2546 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
// Same lowering for f32 via the single-precision packed ops.
2549 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2551 (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2552 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2554 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2556 (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2557 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2559 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2561 (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2562 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2564 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
2566 (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2567 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
// Legacy SSE1 lowering of scalar f32 bitwise logic via the packed PS ops.
// NOTE(review): the trailing scalar-extraction lines of each Pat<> appear to
// be missing from this excerpt — confirm against upstream.
2571 let Predicates = [UseSSE1] in {
2572 // Use packed logical operations for scalar ops.
2573 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2575 (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2576 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2578 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2580 (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2581 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2583 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2585 (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2586 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
2588 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
2590 (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
2591 (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
// Legacy SSE2 lowering of scalar f64 bitwise logic via the packed PD ops.
// NOTE(review): the trailing scalar-extraction lines of each Pat<> appear to
// be missing from this excerpt — confirm against upstream.
2595 let Predicates = [UseSSE2] in {
2596 // Use packed logical operations for scalar ops.
2597 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
2599 (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2600 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2602 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
2604 (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2605 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2607 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
2609 (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2610 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
2612 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
2614 (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
2615 (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
// 128-bit analogue of the AVX2 block above: select VEX-encoded VPAND/VPOR/
// VPXOR/VPANDN for every 128-bit integer element type (bitwise ops are
// element-width agnostic). rr patterns first, then load-folding rm patterns.
2619 let Predicates = [HasAVX, NoVLX] in {
2620 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2621 (VPANDrr VR128:$src1, VR128:$src2)>;
2622 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2623 (VPANDrr VR128:$src1, VR128:$src2)>;
2624 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2625 (VPANDrr VR128:$src1, VR128:$src2)>;
2627 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2628 (VPORrr VR128:$src1, VR128:$src2)>;
2629 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2630 (VPORrr VR128:$src1, VR128:$src2)>;
2631 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2632 (VPORrr VR128:$src1, VR128:$src2)>;
2634 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2635 (VPXORrr VR128:$src1, VR128:$src2)>;
2636 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2637 (VPXORrr VR128:$src1, VR128:$src2)>;
2638 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2639 (VPXORrr VR128:$src1, VR128:$src2)>;
2641 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2642 (VPANDNrr VR128:$src1, VR128:$src2)>;
2643 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2644 (VPANDNrr VR128:$src1, VR128:$src2)>;
2645 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2646 (VPANDNrr VR128:$src1, VR128:$src2)>;
// Load-folding forms (AVX tolerates unaligned folded loads, hence loadv*).
2648 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2649 (VPANDrm VR128:$src1, addr:$src2)>;
2650 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2651 (VPANDrm VR128:$src1, addr:$src2)>;
2652 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2653 (VPANDrm VR128:$src1, addr:$src2)>;
2655 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2656 (VPORrm VR128:$src1, addr:$src2)>;
2657 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2658 (VPORrm VR128:$src1, addr:$src2)>;
2659 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2660 (VPORrm VR128:$src1, addr:$src2)>;
2662 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2663 (VPXORrm VR128:$src1, addr:$src2)>;
2664 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2665 (VPXORrm VR128:$src1, addr:$src2)>;
2666 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2667 (VPXORrm VR128:$src1, addr:$src2)>;
2669 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2670 (VPANDNrm VR128:$src1, addr:$src2)>;
2671 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2672 (VPANDNrm VR128:$src1, addr:$src2)>;
2673 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2674 (VPANDNrm VR128:$src1, addr:$src2)>;
// Legacy SSE2 versions of the 128-bit integer logic patterns. Note the
// memory forms use memopv* (alignment-checked) rather than loadv*, because
// non-VEX SSE instructions require aligned memory operands.
2677 let Predicates = [UseSSE2] in {
2678 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2679 (PANDrr VR128:$src1, VR128:$src2)>;
2680 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2681 (PANDrr VR128:$src1, VR128:$src2)>;
2682 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2683 (PANDrr VR128:$src1, VR128:$src2)>;
2685 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2686 (PORrr VR128:$src1, VR128:$src2)>;
2687 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2688 (PORrr VR128:$src1, VR128:$src2)>;
2689 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2690 (PORrr VR128:$src1, VR128:$src2)>;
2692 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2693 (PXORrr VR128:$src1, VR128:$src2)>;
2694 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2695 (PXORrr VR128:$src1, VR128:$src2)>;
2696 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2697 (PXORrr VR128:$src1, VR128:$src2)>;
2699 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2700 (PANDNrr VR128:$src1, VR128:$src2)>;
2701 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2702 (PANDNrr VR128:$src1, VR128:$src2)>;
2703 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2704 (PANDNrr VR128:$src1, VR128:$src2)>;
// Aligned-load folding forms.
2706 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2707 (PANDrm VR128:$src1, addr:$src2)>;
2708 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2709 (PANDrm VR128:$src1, addr:$src2)>;
2710 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2711 (PANDrm VR128:$src1, addr:$src2)>;
2713 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2714 (PORrm VR128:$src1, addr:$src2)>;
2715 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2716 (PORrm VR128:$src1, addr:$src2)>;
2717 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2718 (PORrm VR128:$src1, addr:$src2)>;
2720 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2721 (PXORrm VR128:$src1, addr:$src2)>;
2722 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2723 (PXORrm VR128:$src1, addr:$src2)>;
2724 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2725 (PXORrm VR128:$src1, addr:$src2)>;
2727 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2728 (PANDNrm VR128:$src1, addr:$src2)>;
2729 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2730 (PANDNrm VR128:$src1, addr:$src2)>;
2731 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2732 (PANDNrm VR128:$src1, addr:$src2)>;
2735 // Patterns for packed operations when we don't have integer type available.
// With SSE1 only (no SSE2 integer ops), v4f32 bitwise logic selects the FP
// ANDPS/ORPS/XORPS/ANDNPS instructions directly; memory forms require
// aligned operands (memopv4f32).
// NOTE(review): the enclosing `let Predicates = [...]` line appears to be
// missing from this excerpt — confirm the predicate gating upstream.
2736 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2737 (ANDPSrr VR128:$src1, VR128:$src2)>;
2738 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2739 (ORPSrr VR128:$src1, VR128:$src2)>;
2740 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2741 (XORPSrr VR128:$src1, VR128:$src2)>;
2742 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2743 (ANDNPSrr VR128:$src1, VR128:$src2)>;
2745 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2746 (ANDPSrm VR128:$src1, addr:$src2)>;
2747 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2748 (ORPSrm VR128:$src1, addr:$src2)>;
2749 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2750 (XORPSrm VR128:$src1, addr:$src2)>;
2751 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2752 (ANDNPSrm VR128:$src1, addr:$src2)>;
2754 //===----------------------------------------------------------------------===//
2755 // SSE 1 & 2 - Arithmetic Instructions
2756 //===----------------------------------------------------------------------===//
2758 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2761 /// In addition, we also have a special variant of the scalar form here to
2762 /// represent the associated intrinsic operation. This form is unlike the
2763 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
2764 /// and leaves the top elements unmodified (therefore these cannot be commuted).
2766 /// These three forms can each be reg+reg or reg+mem.
2769 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
// basic_sse12_fp_binop_p - packed forms of an SSE1/SSE2 FP binop (add, mul,
// sub, div, min, max): AVX 128/256-bit PS/PD variants plus legacy tied-
// operand SSE variants.
// NOTE(review): the trailing sched/Is2Addr arguments and closing braces of
// the legacy defms below appear to be missing from this excerpt.
2771 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2772 SDNode OpNode, X86SchedWriteSizes sched> {
2773 let Predicates = [HasAVX, NoVLX] in {
2774 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2775 VR128, v4f32, f128mem, loadv4f32,
2776 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2777 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2778 VR128, v2f64, f128mem, loadv2f64,
2779 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2781 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2782 OpNode, VR256, v8f32, f256mem, loadv8f32,
2783 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2784 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2785 OpNode, VR256, v4f64, f256mem, loadv4f64,
2786 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE encodings are destructive ($src1 tied to $dst) and use
// alignment-checked memop fragments.
2789 let Constraints = "$src1 = $dst" in {
2790 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2791 v4f32, f128mem, memopv4f32, SSEPackedSingle,
2793 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2794 v2f64, f128mem, memopv2f64, SSEPackedDouble,
// basic_sse12_fp_binop_s - scalar (SS/SD) forms of an SSE1/SSE2 FP binop,
// operating on FR32/FR64 registers: AVX 3-operand variants plus legacy
// tied-operand SSE variants.
// NOTE(review): the trailing sched arguments of the legacy defms below
// appear to be missing from this excerpt.
2799 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2800 X86SchedWriteSizes sched> {
2801 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2802 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2803 XS, VEX_4V, VEX_LIG, VEX_WIG;
2804 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2805 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2806 XD, VEX_4V, VEX_LIG, VEX_WIG;
2808 let Constraints = "$src1 = $dst" in {
2809 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2810 OpNode, FR32, f32mem, SSEPackedSingle,
2812 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2813 OpNode, FR64, f64mem, SSEPackedDouble,
// basic_sse12_fp_binop_s_int - intrinsic (whole-XMM, _Int-suffixed) forms of
// a scalar FP binop: they take full v4f32/v2f64 vectors and leave the upper
// elements of the destination untouched, matching the *_ss/*_sd intrinsics.
2818 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2819 SDPatternOperator OpNode,
2820 X86SchedWriteSizes sched> {
2821 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2822 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2823 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2824 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2825 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2826 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
// Legacy SSE forms (2-address; Is2Addr defaults to 1 in sse12_fp_scalar_int).
2828 let Constraints = "$src1 = $dst" in {
2829 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2830 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2831 SSEPackedSingle, sched.PS.Scl>, XS;
2832 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2833 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2834 SSEPackedDouble, sched.PD.Scl>, XD;
2838 // Binary Arithmetic instructions
// Each defm instantiates packed, scalar, and intrinsic forms together.
// ADD/MUL are commutable; SUB/DIV/MAX/MIN are not (FP max/min are not
// commutable because of their NaN/signed-zero ordering semantics).
// The _s_int forms of ADD/MUL/SUB/DIV use null_frag: no DAG pattern, the
// intrinsics are matched elsewhere; MAX/MIN match X86fmaxs/X86fmins.
2839 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
2840 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
2841 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2842 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
2843 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
2844 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
2845 let isCommutable = 0 in {
2846 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2847 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
2848 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2849 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2850 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
2851 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2852 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2853 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2854 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2855 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2856 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2857 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
// Commutable ("C") variants of max/min, usable when the compiler has proven
// NaN/signed-zero ordering does not matter. Same encodings as MAX/MIN, so
// they are isCodeGenOnly (never produced by the assembler/disassembler).
2860 let isCodeGenOnly = 1 in {
2861 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2862 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2863 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2864 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2867 // Patterns used to select SSE scalar fp arithmetic instructions from
2870 // (1) a scalar fp operation followed by a blend
2872 // The effect is that the backend no longer emits unnecessary vector
2873 // insert instructions immediately after SSE scalar fp instructions
2874 // like addss or mulss.
2876 // For example, given the following code:
2877 // __m128 foo(__m128 A, __m128 B) {
2882 // Previously we generated:
2883 // addss %xmm0, %xmm1
2884 // movss %xmm1, %xmm0
2887 // addss %xmm1, %xmm0
2889 // (2) a vector packed single/double fp operation followed by a vector insert
2891 // The effect is that the backend converts the packed fp instruction
2892 // followed by a vector insert into a single SSE scalar fp instruction.
2894 // For example, given the following code:
2895 // __m128 foo(__m128 A, __m128 B) {
2896 // __m128 C = A + B;
2897 // return (__m128) {c[0], a[1], a[2], a[3]};
2900 // Previously we generated:
2901 // addps %xmm0, %xmm1
2902 // movss %xmm1, %xmm0
2905 // addss %xmm1, %xmm0
2907 // TODO: Some canonicalization in lowering would simplify the number of
2908 // patterns we have to try to match.
// scalar_math_patterns - folds "scalar op on element 0 + movss/movsd merge"
// into the single _Int scalar instruction (see the long comment above).
// Emitted once for the legacy SSE instruction (under BasePredicate) and once
// for the V-prefixed AVX instruction (under UseAVX).
// NOTE(review): each Pat<> source appears truncated (the RC:$src operand line
// of the inner Op is missing from this excerpt) — confirm upstream.
2909 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2910 ValueType VT, ValueType EltTy,
2911 RegisterClass RC, Predicate BasePredicate> {
2912 let Predicates = [BasePredicate] in {
2913 // extracted scalar math op with insert via movss/movsd
2914 def : Pat<(VT (Move (VT VR128:$dst),
2915 (VT (scalar_to_vector
2916 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2918 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2919 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2922 // Repeat for AVX versions of the instructions.
2923 let Predicates = [UseAVX] in {
2924 // extracted scalar math op with insert via movss/movsd
2925 def : Pat<(VT (Move (VT VR128:$dst),
2926 (VT (scalar_to_vector
2927 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2929 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2930 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
// Instantiate the blend-folding patterns for the four basic scalar FP ops,
// f32 under SSE1 and f64 under SSE2.
2934 defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2935 defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2936 defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2937 defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
2939 defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2940 defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2941 defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2942 defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
2945 /// In addition, we also have a special variant of the scalar form here to
2946 /// represent the associated intrinsic operation. This form is unlike the
2947 /// plain scalar form, in that it takes an entire vector (instead of a
2948 /// scalar) and leaves the top elements undefined.
2950 /// And, we have a special variant form for a full-vector intrinsic form.
2952 /// sse_fp_unop_s - SSE1 unops in scalar form
2953 /// For the non-AVX defs, we need $src1 to be tied to $dst because
2954 /// the HW instructions are 2 operand / destructive.
// Forms: r (reg), m (mem, only selected under OptForSize to avoid the
// partial-register-update stall of a folded load), and pattern-less _Int
// whole-vector forms used by the *_ss/*_sd intrinsic patterns below.
2955 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2956 ValueType ScalarVT, X86MemOperand x86memop,
2957 Operand intmemop, SDNode OpNode, Domain d,
2958 X86FoldableSchedWrite sched, Predicate target> {
2959 let hasSideEffects = 0 in {
2960 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2961 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2962 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2965 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2966 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2967 [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2968 Sched<[sched.Folded]>,
2969 Requires<[target, OptForSize]>;
// Intrinsic forms: no patterns here; $src1 tied to $dst models the unchanged
// upper elements of the destination.
2971 let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
2972 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2973 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2976 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2977 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2978 Sched<[sched.Folded, sched.ReadAfterFold]>;
// sse_fp_unop_s_intr - patterns mapping a scalar-unop intrinsic onto the
// _Int instructions defined by sse_fp_unop_s (legacy SSE encodings).
2984 multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2985 ComplexPattern int_cpat, Intrinsic Intr,
2986 Predicate target, string Suffix> {
2987 let Predicates = [target] in {
2988 // These are unary operations, but they are modeled as having 2 source operands
2989 // because the high elements of the destination are unchanged in SSE.
2990 def : Pat<(Intr VR128:$src),
2991 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2993 // We don't want to fold scalar loads into these instructions unless
2994 // optimizing for size. This is because the folded instruction will have a
2995 // partial register update, while the unfolded sequence will not, e.g.
2997 // rcpss %xmm0, %xmm0
2998 // which has a clobber before the rcp, vs.
// Memory form only under OptForSize; upper elements come from IMPLICIT_DEF.
3000 let Predicates = [target, OptForSize] in {
3001 def : Pat<(Intr int_cpat:$src2),
3002 (!cast<Instruction>(NAME#m_Int)
3003 (vt (IMPLICIT_DEF)), addr:$src2)>;
// avx_fp_unop_s_intr - AVX counterpart of sse_fp_unop_s_intr: maps the
// intrinsic onto the 3-operand V* _Int instructions.
// NOTE(review): the second operand line of the register pattern appears to
// be missing from this excerpt — confirm upstream.
3007 multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
3008 Intrinsic Intr, Predicate target> {
3009 let Predicates = [target] in {
3010 def : Pat<(Intr VR128:$src),
3011 (!cast<Instruction>(NAME#r_Int) VR128:$src,
3014 let Predicates = [target, OptForSize] in {
3015 def : Pat<(Intr int_cpat:$src2),
3016 (!cast<Instruction>(NAME#m_Int)
3017 (vt (IMPLICIT_DEF)), addr:$src2)>;
// avx_fp_unop_s - AVX scalar unops: 3-operand, non-destructive ($src1
// supplies the upper destination elements). Instruction defs carry no
// patterns; selection happens via the explicit Pat<>s at the bottom, which
// feed IMPLICIT_DEF as the pass-through operand.
3021 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
3022 ValueType ScalarVT, X86MemOperand x86memop,
3023 Operand intmemop, SDNode OpNode, Domain d,
3024 X86FoldableSchedWrite sched, Predicate target> {
3025 let hasSideEffects = 0 in {
3026 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
3027 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3028 [], d>, Sched<[sched]>;
3030 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3031 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3032 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Whole-vector _Int forms for the intrinsic patterns (avx_fp_unop_s_intr).
3033 let isCodeGenOnly = 1, ExeDomain = d in {
3034 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
3035 (ins VR128:$src1, VR128:$src2),
3036 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3037 []>, Sched<[sched]>;
3039 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
3040 (ins VR128:$src1, intmemop:$src2),
3041 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3042 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
3046 // We don't want to fold scalar loads into these instructions unless
3047 // optimizing for size. This is because the folded instruction will have a
3048 // partial register update, while the unfolded sequence will not, e.g.
3049 // vmovss mem, %xmm0
3050 // vrcpss %xmm0, %xmm0, %xmm0
3051 // which has a clobber before the rcp, vs.
3052 // vrcpss mem, %xmm0, %xmm0
3053 // TODO: In theory, we could fold the load, and avoid the stall caused by
3054 // the partial register store, either in BreakFalseDeps or with smarter RA.
3055 let Predicates = [target] in {
3056 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
3057 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
3059 let Predicates = [target, OptForSize] in {
3060 def : Pat<(ScalarVT (OpNode (load addr:$src))),
3061 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
3066 /// sse1_fp_unop_p - SSE1 unops in packed form.
// Defines the VEX-encoded 128-bit (V*PSr/m) and 256-bit (V*PSYr/m) forms
// under the given predicate list, plus legacy SSE1 PSr/PSm forms below.
// AVX memory forms fold possibly-unaligned loads (loadv*); the legacy form
// requires aligned memory (memopv4f32).
3067 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
3068 X86SchedWriteWidths sched, list<Predicate> prds> {
3069 let Predicates = prds in {
3070 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3071 !strconcat("v", OpcodeStr,
3072 "ps\t{$src, $dst|$dst, $src}"),
3073 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
3074 VEX, Sched<[sched.XMM]>, VEX_WIG;
3075 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3076 !strconcat("v", OpcodeStr,
3077 "ps\t{$src, $dst|$dst, $src}"),
3078 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
3079 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
3080 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3081 !strconcat("v", OpcodeStr,
3082 "ps\t{$src, $dst|$dst, $src}"),
3083 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
3084 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3085 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3086 !strconcat("v", OpcodeStr,
3087 "ps\t{$src, $dst|$dst, $src}"),
3088 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
3089 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE1 encodings.
3092 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3093 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3094 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
3096 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3097 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3098 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
3099 Sched<[sched.XMM.Folded]>;
3102 /// sse2_fp_unop_p - SSE2 unops in vector forms.
// Packed-double analogue of sse1_fp_unop_p: AVX forms under [HasAVX, NoVLX]
// plus legacy SSE2 XMM forms (in practice only sqrtpd uses this).
3103 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
3104 SDNode OpNode, X86SchedWriteWidths sched> {
3105 let Predicates = [HasAVX, NoVLX] in {
3106 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3107 !strconcat("v", OpcodeStr,
3108 "pd\t{$src, $dst|$dst, $src}"),
3109 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
3110 VEX, Sched<[sched.XMM]>, VEX_WIG;
// Unaligned loads are legal for the AVX encodings (loadv2f64).
3111 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3112 !strconcat("v", OpcodeStr,
3113 "pd\t{$src, $dst|$dst, $src}"),
3114 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
3115 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
3116 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3117 !strconcat("v", OpcodeStr,
3118 "pd\t{$src, $dst|$dst, $src}"),
3119 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
3120 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3121 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3122 !strconcat("v", OpcodeStr,
3123 "pd\t{$src, $dst|$dst, $src}"),
3124 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
3125 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
// Legacy SSE2 forms; memopv2f64 enforces alignment.
3128 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3129 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3130 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
3132 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3133 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3134 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
3135 Sched<[sched.XMM.Folded]>;
// sse1_fp_unop_s_intr - Intrinsic (_ss) forms of the SSE1 scalar unary ops.
// Resolves the matching intrinsic by name, e.g. int_x86_sse_rcp_ss.
3138 multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
3139 X86SchedWriteWidths sched, Predicate AVXTarget> {
3140 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
3141 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
// AVX variant; NotMemoryFoldable: the folding tables must not fold this one.
3143 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
3144 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
3146 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
// sse1_fp_unop_s - Scalar single-precision unary op: legacy SSE1 form (XS
// prefix, UseSSE1) plus the AVX three-operand "v"-prefixed form.
3149 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3150 X86SchedWriteWidths sched, Predicate AVXTarget> {
3151 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
3152 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
3153 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
3154 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
3155 XS, VEX_4V, VEX_LIG, VEX_WIG;
// sse2_fp_unop_s - Scalar double-precision analogue of sse1_fp_unop_s
// (XD prefix, FR64/f64 operands; only sqrtsd instantiates this).
3158 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3159 X86SchedWriteWidths sched, Predicate AVXTarget> {
3160 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
3161 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
3162 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
3163 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
3164 XD, VEX_4V, VEX_LIG, VEX_WIG;
// Instantiate sqrt (exact, opcode 0x51) in scalar and packed, single and
// double forms.
3168 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
3169 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3170 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
3171 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
3173 // Reciprocal approximations. Note that these typically require refinement
3174 // in order to obtain suitable precision.
// NOTE(review): these use HasAVX where SQRT uses UseAVX — presumably because
// EVEX has no vrcpss/vrsqrtss replacement; confirm against the predicates'
// definitions.
3175 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3176 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3177 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3178 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3179 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3180 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3182 // There is no f64 version of the reciprocal approximation instructions.
3182 // There is no f64 version of the reciprocal approximation instructions.
3184 multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3185 ValueType VT, Predicate BasePredicate> {
3186 let Predicates = [BasePredicate] in {
3187 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3188 (OpNode (extractelt VT:$src, 0))))),
3189 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3192 // Repeat for AVX versions of the instructions.
3193 let Predicates = [UseAVX] in {
3194 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3195 (OpNode (extractelt VT:$src, 0))))),
3196 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
// scalar_unary_math_imm_patterns - Same as scalar_unary_math_patterns, but
// for instructions carrying a fixed immediate (e.g. roundss rounding mode).
3200 multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3201 ValueType VT, bits<8> ImmV,
3202 Predicate BasePredicate> {
3203 let Predicates = [BasePredicate] in {
3204 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3205 (OpNode (extractelt VT:$src, 0))))),
3206 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
3209 // Repeat for AVX versions of the instructions.
3210 let Predicates = [UseAVX] in {
3211 def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3212 (OpNode (extractelt VT:$src, 0))))),
3213 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
// Scalar sqrt-through-blend patterns for SQRTSS/SQRTSD.
3217 defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3218 defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
// scalar_unary_math_intr_patterns - Like scalar_unary_math_patterns, but
// matches the intrinsic node directly (no extractelt/scalar_to_vector dance).
3220 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3221 SDNode Move, ValueType VT,
3222 Predicate BasePredicate> {
3223 let Predicates = [BasePredicate] in {
3224 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3225 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3228 // Repeat for AVX versions of the instructions.
3229 let Predicates = [HasAVX] in {
3230 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3231 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
// Intrinsic patterns for RCPSS/RSQRTSS (trailing type/predicate arguments
// fall on lines lost in this extract).
3235 defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3237 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3241 //===----------------------------------------------------------------------===//
3242 // SSE 1 & 2 - Non-temporal stores
3243 //===----------------------------------------------------------------------===//
// AddedComplexity biases selection toward these over plain aligned stores.
3245 let AddedComplexity = 400 in { // Prefer non-temporal versions
3246 let Predicates = [HasAVX, NoVLX] in {
3247 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
// vmovntps/vmovntpd require 16-byte-aligned destinations
// (alignednontemporalstore).
3248 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3249 (ins f128mem:$dst, VR128:$src),
3250 "movntps\t{$src, $dst|$dst, $src}",
3251 [(alignednontemporalstore (v4f32 VR128:$src),
3252 addr:$dst)]>, VEX, VEX_WIG;
3253 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3254 (ins f128mem:$dst, VR128:$src),
3255 "movntpd\t{$src, $dst|$dst, $src}",
3256 [(alignednontemporalstore (v2f64 VR128:$src),
3257 addr:$dst)]>, VEX, VEX_WIG;
// 256-bit (YMM) variants; 32-byte alignment required.
3260 let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3261 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3262 (ins f256mem:$dst, VR256:$src),
3263 "movntps\t{$src, $dst|$dst, $src}",
3264 [(alignednontemporalstore (v8f32 VR256:$src),
3265 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3266 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3267 (ins f256mem:$dst, VR256:$src),
3268 "movntpd\t{$src, $dst|$dst, $src}",
3269 [(alignednontemporalstore (v4f64 VR256:$src),
3270 addr:$dst)]>, VEX, VEX_L, VEX_WIG;
// Integer-domain non-temporal stores (vmovntdq).
3273 let ExeDomain = SSEPackedInt in {
3274 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3275 (ins i128mem:$dst, VR128:$src),
3276 "movntdq\t{$src, $dst|$dst, $src}",
3277 [(alignednontemporalstore (v2i64 VR128:$src),
3278 addr:$dst)]>, VEX, VEX_WIG,
3279 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3280 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3281 (ins i256mem:$dst, VR256:$src),
3282 "movntdq\t{$src, $dst|$dst, $src}",
3283 [(alignednontemporalstore (v4i64 VR256:$src),
3284 addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3285 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
// Legacy (non-VEX) SSE1/SSE2 non-temporal stores.
3289 let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3290 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3291 "movntps\t{$src, $dst|$dst, $src}",
3292 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3293 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3294 "movntpd\t{$src, $dst|$dst, $src}",
3295 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
// NOTE(review): the $dst operand here is f128mem while VMOVNTDQmr uses
// i128mem for the same integer store — confirm whether this asymmetry is
// intentional.
3298 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3299 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3300 "movntdq\t{$src, $dst|$dst, $src}",
3301 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3303 let SchedRW = [WriteStoreNT] in {
3304 // There is no AVX form for instructions below this point
// movnti (GPR non-temporal store); the 64-bit form needs REX.W (RI class).
3305 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3306 "movnti{l}\t{$src, $dst|$dst, $src}",
3307 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3308 PS, Requires<[HasSSE2]>;
3309 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3310 "movnti{q}\t{$src, $dst|$dst, $src}",
3311 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3312 PS, Requires<[HasSSE2]>;
3313 } // SchedRW = [WriteStoreNT]
// Route the remaining integer element widths through [V]MOVNTDQ[Y]mr; the
// instruction stores raw 128/256 bits, so element type is irrelevant.
3315 let Predicates = [HasAVX, NoVLX] in {
3316 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3317 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3318 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3319 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3320 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3321 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3323 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3324 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3325 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3326 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3327 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3328 (VMOVNTDQmr addr:$dst, VR128:$src)>;
// Legacy SSE2 equivalents of the XMM patterns above.
3331 let Predicates = [UseSSE2] in {
3332 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3333 (MOVNTDQmr addr:$dst, VR128:$src)>;
3334 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3335 (MOVNTDQmr addr:$dst, VR128:$src)>;
3336 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3337 (MOVNTDQmr addr:$dst, VR128:$src)>;
3340 } // AddedComplexity
3342 //===----------------------------------------------------------------------===//
3343 // SSE 1 & 2 - Prefetch and memory fence
3344 //===----------------------------------------------------------------------===//
3346 // Prefetch intrinsic.
// All share opcode 0x18; the /r reg field (MRM0m-MRM3m) selects the hint.
// Pattern operands: address, rw (immediate, unused), locality, cache type.
3347 let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3348 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3349 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3350 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3351 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3352 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3353 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3354 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3355 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3358 // FIXME: How should flush instruction be modeled?
3359 let SchedRW = [WriteLoad] in {
// Cache-line flush.
3361 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3362 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3363 PS, Requires<[HasSSE2]>;
3366 let SchedRW = [WriteNop] in {
3367 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3368 // was introduced with SSE2, it's backward compatible.
3369 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3370 "pause", [(int_x86_sse2_pause)]>, OBXS;
3373 let SchedRW = [WriteFence] in {
3374 // Load, store, and memory fence
3375 // TODO: As with mfence, we may want to ease the availablity of sfence/lfence
3376 // to include any 64-bit target.
3377 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3378 PS, Requires<[HasSSE1]>;
3379 def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3380 PS, Requires<[HasSSE2]>;
3381 def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3382 PS, Requires<[HasMFence]>;
// Lower the generic X86MFence node to the mfence instruction.
3385 def : Pat<(X86MFence), (MFENCE)>;
3387 //===----------------------------------------------------------------------===//
3388 // SSE 1 & 2 - Load/Store XCSR register
3389 //===----------------------------------------------------------------------===//
// MXCSR control/status register access; VEX-encoded forms first, then the
// legacy SSE encodings.
3391 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3392 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3393 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3394 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3395 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3396 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3398 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3399 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3400 TB, Sched<[WriteLDMXCSR]>;
3401 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3402 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3403 TB, Sched<[WriteSTMXCSR]>;
3405 //===---------------------------------------------------------------------===//
3406 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3407 //===---------------------------------------------------------------------===//
3409 let ExeDomain = SSEPackedInt in { // SSE integer instructions
// Register-to-register moves; no patterns (handled by the register
// allocator / copy lowering), so hasSideEffects = 0.
3411 let hasSideEffects = 0 in {
3412 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3413 "movdqa\t{$src, $dst|$dst, $src}", []>,
3414 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3415 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3416 "movdqu\t{$src, $dst|$dst, $src}", []>,
3417 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3418 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3419 "movdqa\t{$src, $dst|$dst, $src}", []>,
3420 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3421 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3422 "movdqu\t{$src, $dst|$dst, $src}", []>,
3423 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
// _REV: the store-form encoding (0x7F) used register-to-register; needed so
// the disassembler round-trips both encodings. FoldGenData links each to its
// canonical twin.
3427 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3428 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3429 "movdqa\t{$src, $dst|$dst, $src}", []>,
3430 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3431 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3432 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3433 "movdqa\t{$src, $dst|$dst, $src}", []>,
3434 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3435 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3436 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3437 "movdqu\t{$src, $dst|$dst, $src}", []>,
3438 Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3439 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3440 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3441 "movdqu\t{$src, $dst|$dst, $src}", []>,
3442 Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3443 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
// AVX vmovdqa/vmovdqu loads. canFoldAsLoad/isReMaterializable let the
// register allocator re-load instead of spilling.
3446 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3447 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3448 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3449 "movdqa\t{$src, $dst|$dst, $src}",
3450 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3451 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
// NOTE(review): YMM forms carry empty patterns here while the XMM forms have
// them — the v4i64 patterns are presumably supplied elsewhere; confirm.
3452 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3453 "movdqa\t{$src, $dst|$dst, $src}", []>,
3454 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3455 VEX, VEX_L, VEX_WIG;
3456 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3457 "vmovdqu\t{$src, $dst|$dst, $src}",
3458 [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3459 Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3461 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3462 "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3463 Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3464 XS, VEX, VEX_L, VEX_WIG;
// AVX store forms (opcode 0x7F).
3467 let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3468 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3469 (ins i128mem:$dst, VR128:$src),
3470 "movdqa\t{$src, $dst|$dst, $src}",
3471 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3472 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3473 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3474 (ins i256mem:$dst, VR256:$src),
3475 "movdqa\t{$src, $dst|$dst, $src}", []>,
3476 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3477 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3478 "vmovdqu\t{$src, $dst|$dst, $src}",
3479 [(store (v2i64 VR128:$src), addr:$dst)]>,
3480 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3481 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3482 "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3483 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
// Legacy SSE2 movdqa/movdqu. Patterns on the memory forms are commented out;
// selection goes through separate Pat<>s so these stay usable without them.
3486 let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3487 let hasSideEffects = 0 in {
3488 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3489 "movdqa\t{$src, $dst|$dst, $src}", []>;
3491 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3492 "movdqu\t{$src, $dst|$dst, $src}", []>,
3493 XS, Requires<[UseSSE2]>;
// Reversed (store-opcode) reg-reg encodings for the disassembler.
3497 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3498 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3499 "movdqa\t{$src, $dst|$dst, $src}", []>,
3500 FoldGenData<"MOVDQArr">;
3502 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3503 "movdqu\t{$src, $dst|$dst, $src}", []>,
3504 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3508 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3509 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3510 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3511 "movdqa\t{$src, $dst|$dst, $src}",
3512 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3513 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3514 "movdqu\t{$src, $dst|$dst, $src}",
3515 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3516 XS, Requires<[UseSSE2]>;
3519 let mayStore = 1, hasSideEffects = 0,
3520 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3521 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3522 "movdqa\t{$src, $dst|$dst, $src}",
3523 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3524 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3525 "movdqu\t{$src, $dst|$dst, $src}",
3526 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3527 XS, Requires<[UseSSE2]>;
3530 } // ExeDomain = SSEPackedInt
3532 // Aliases to help the assembler pick two byte VEX encodings by swapping the
3533 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
// VR128L/VR128H split the register file so the alias only fires when the
// swap actually shortens the encoding.
3534 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3535 (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
3536 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3537 (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
3538 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3539 (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
3540 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3541 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
3543 // Reversed version with ".s" suffix for GAS compatibility.
3544 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3545 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3546 def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3547 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3548 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3549 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3550 def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3551 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3553 // Reversed version with ".s" suffix for GAS compatibility.
3554 def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3555 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3556 def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3557 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
// Element-width-agnostic load/store patterns: [V]MOVDQ[AU] move raw 128
// bits, so every integer vector type maps to the same instructions.
3559 let Predicates = [HasAVX, NoVLX] in {
3560 // Additional patterns for other integer sizes.
3561 def : Pat<(alignedloadv4i32 addr:$src),
3562 (VMOVDQArm addr:$src)>;
3563 def : Pat<(alignedloadv8i16 addr:$src),
3564 (VMOVDQArm addr:$src)>;
3565 def : Pat<(alignedloadv16i8 addr:$src),
3566 (VMOVDQArm addr:$src)>;
3567 def : Pat<(loadv4i32 addr:$src),
3568 (VMOVDQUrm addr:$src)>;
3569 def : Pat<(loadv8i16 addr:$src),
3570 (VMOVDQUrm addr:$src)>;
3571 def : Pat<(loadv16i8 addr:$src),
3572 (VMOVDQUrm addr:$src)>;
3574 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3575 (VMOVDQAmr addr:$dst, VR128:$src)>;
3576 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3577 (VMOVDQAmr addr:$dst, VR128:$src)>;
3578 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3579 (VMOVDQAmr addr:$dst, VR128:$src)>;
3580 def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3581 (VMOVDQUmr addr:$dst, VR128:$src)>;
3582 def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3583 (VMOVDQUmr addr:$dst, VR128:$src)>;
3584 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3585 (VMOVDQUmr addr:$dst, VR128:$src)>;
3588 //===---------------------------------------------------------------------===//
3589 // SSE2 - Packed Integer Arithmetic Instructions
3590 //===---------------------------------------------------------------------===//
3592 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3594 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
// Used for widening ops like pmaddwd/psadbw (e.g. v8i16 inputs, v4i32
// output). Is2Addr selects legacy two-operand vs. VEX three-operand asm.
3595 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3596 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3597 PatFrag memop_frag, X86MemOperand x86memop,
3598 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3599 let isCommutable = 1 in
3600 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3601 (ins RC:$src1, RC:$src2),
3603 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3604 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3605 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3607 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3608 (ins RC:$src1, x86memop:$src2),
3610 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3611 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3612 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3613 (memop_frag addr:$src2))))]>,
3614 Sched<[sched.Folded, sched.ReadAfterFold]>;
3616 } // ExeDomain = SSEPackedInt
// SSE2/AVX/AVX2 packed integer arithmetic. Arguments to PDI_binop_all:
// opcode, mnemonic, SDNode, 128-bit VT, 256-bit VT, scheduler, isCommutable,
// and the predicate that blocks these when the EVEX form should win.
3618 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3619 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3620 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3621 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3622 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3623 SchedWriteVecALU, 1, NoVLX>;
3624 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3625 SchedWriteVecALU, 1, NoVLX>;
// Saturating adds.
3626 defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3627 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3628 defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3629 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3630 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3631 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3632 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3633 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3634 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3635 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3636 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3637 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3638 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3639 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
// Subtracts are non-commutative (isCommutable = 0).
3640 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3641 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3642 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3643 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3644 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3645 SchedWriteVecALU, 0, NoVLX>;
3646 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3647 SchedWriteVecALU, 0, NoVLX>;
3648 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3649 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3650 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3651 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3652 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3653 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3654 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3655 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
// Min/max and average.
3656 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3657 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3658 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3659 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3660 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3661 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3662 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3663 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3664 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3665 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3666 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3667 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3668 defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3669 SchedWriteVecIMul, 1, NoVLX>;
// pmaddwd/psadbw have narrower sources than destinations, so they go through
// PDI_binop_rm2 (VEX 128-bit, VEX 256-bit, then legacy two-address SSE2).
3671 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3672 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3673 load, i128mem, SchedWriteVecIMul.XMM, 0>,
3676 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3677 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3678 VR256, load, i256mem, SchedWriteVecIMul.YMM,
3679 0>, VEX_4V, VEX_L, VEX_WIG;
3680 let Constraints = "$src1 = $dst" in
3681 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3682 memop, i128mem, SchedWriteVecIMul.XMM>;
3684 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3685 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3686 load, i128mem, SchedWritePSADBW.XMM, 0>,
3688 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3689 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3690 load, i256mem, SchedWritePSADBW.YMM, 0>,
3691 VEX_4V, VEX_L, VEX_WIG;
3692 let Constraints = "$src1 = $dst" in
3693 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3694 memop, i128mem, SchedWritePSADBW.XMM>;
3696 //===---------------------------------------------------------------------===//
3697 // SSE2 - Packed Integer Logical Instructions
3698 //===---------------------------------------------------------------------===//
// PDI_binop_rmi - Vector shift with three encodings: by an XMM count (rr),
// by a count loaded from memory (rm), and by an 8-bit immediate (ri, which
// uses the second opcode plus a /r field encoded via ImmForm).
3700 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3701 string OpcodeStr, SDNode OpNode,
3702 SDNode OpNode2, RegisterClass RC,
3703 X86FoldableSchedWrite sched,
3704 X86FoldableSchedWrite schedImm,
3705 ValueType DstVT, ValueType SrcVT,
3706 PatFrag ld_frag, bit Is2Addr = 1> {
3707 // src2 is always 128-bit
3708 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3709 (ins RC:$src1, VR128:$src2),
3711 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3712 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3713 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3715 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3716 (ins RC:$src1, i128mem:$src2),
3718 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3719 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3720 [(set RC:$dst, (DstVT (OpNode RC:$src1,
3721 (SrcVT (ld_frag addr:$src2)))))]>,
3722 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Immediate form uses OpNode2 (the *i shift node) and opc2.
3723 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3724 (ins RC:$src1, u8imm:$src2),
3726 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3727 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3728 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
// PDI_binop_rmi_all - Expand PDI_binop_rmi into AVX 128-bit, AVX2 256-bit,
// and legacy two-address SSE2 variants.
3732 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3733 string OpcodeStr, SDNode OpNode,
3734 SDNode OpNode2, ValueType DstVT128,
3735 ValueType DstVT256, ValueType SrcVT,
3736 X86SchedWriteWidths sched,
3737 X86SchedWriteWidths schedImm, Predicate prd> {
3738 let Predicates = [HasAVX, prd] in
3739 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3740 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3741 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3742 let Predicates = [HasAVX2, prd] in
3743 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3744 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3745 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3747 let Constraints = "$src1 = $dst" in
3748 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3749 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
// PDI_binop_ri - Immediate-only binop; used for the whole-register byte
// shifts pslldq/psrldq which have no register-count form.
3753 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3754 SDNode OpNode, RegisterClass RC, ValueType VT,
3755 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3756 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3758 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3759 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3760 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
// PDI_binop_ri_all - AVX/AVX2/SSE2 expansion of PDI_binop_ri over v16i8 and
// v32i8 (byte-granularity whole-register shifts).
3764 multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3765 SDNode OpNode, X86SchedWriteWidths sched> {
3766 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3767 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3768 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3769 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3770 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3771 VR256, v32i8, sched.YMM, 0>,
3772 VEX_4V, VEX_L, VEX_WIG;
3773 let Constraints = "$src1 = $dst" in
3774 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
// Packed shifts. Each takes a variable-count node (X86vshl/vsrl/vsra) and an
// immediate-count node (*i variant); the immediate encoding shares an opcode
// (0x71/0x72/0x73) disambiguated by the MRM reg field.
3778 let ExeDomain = SSEPackedInt in {
3779 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3780 v8i16, v16i16, v8i16, SchedWriteVecShift,
3781 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3782 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3783 v4i32, v8i32, v4i32, SchedWriteVecShift,
3784 SchedWriteVecShiftImm, NoVLX>;
3785 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3786 v2i64, v4i64, v2i64, SchedWriteVecShift,
3787 SchedWriteVecShiftImm, NoVLX>;
3789 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3790 v8i16, v16i16, v8i16, SchedWriteVecShift,
3791 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3792 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3793 v4i32, v8i32, v4i32, SchedWriteVecShift,
3794 SchedWriteVecShiftImm, NoVLX>;
3795 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3796 v2i64, v4i64, v2i64, SchedWriteVecShift,
3797 SchedWriteVecShiftImm, NoVLX>;
// Arithmetic right shifts exist only for 16- and 32-bit elements in SSE2.
3799 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3800 v8i16, v16i16, v8i16, SchedWriteVecShift,
3801 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3802 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3803 v4i32, v8i32, v4i32, SchedWriteVecShift,
3804 SchedWriteVecShiftImm, NoVLX>;
// Whole-register byte shifts (immediate only).
3806 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3808 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3810 } // ExeDomain = SSEPackedInt
3812 //===---------------------------------------------------------------------===//
3813 // SSE2 - Packed Integer Comparison Instructions
3814 //===---------------------------------------------------------------------===//
// SSE2 packed-integer compares (with AVX/AVX2 "v"-prefixed forms expanded by
// PDI_binop_all): opcodes 0x74-0x76 are byte/word/dword equality, 0x64-0x66
// are byte/word/dword signed greater-than. The trailing 1/0 flag differs
// between the EQ and GT groups — presumably commutability (pcmpeq is
// symmetric, pcmpgt is not); PDI_binop_all's declaration is outside this
// view, so confirm against its definition.
3816 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3817                              SchedWriteVecALU, 1, TruePredicate>;
3818 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3819                              SchedWriteVecALU, 1, TruePredicate>;
3820 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3821                              SchedWriteVecALU, 1, TruePredicate>;
3822 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3823                              SchedWriteVecALU, 0, TruePredicate>;
3824 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3825                              SchedWriteVecALU, 0, TruePredicate>;
3826 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3827                              SchedWriteVecALU, 0, TruePredicate>;
3829 //===---------------------------------------------------------------------===//
3830 // SSE2 - Packed Integer Shuffle Instructions
3831 //===---------------------------------------------------------------------===//
3833 let ExeDomain = SSEPackedInt in {
3834 multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3835 SDNode OpNode, X86SchedWriteWidths sched,
3837 let Predicates = [HasAVX, prd] in {
3838 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3839 (ins VR128:$src1, u8imm:$src2),
3840 !strconcat("v", OpcodeStr,
3841 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3843 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3844 VEX, Sched<[sched.XMM]>, VEX_WIG;
3845 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3846 (ins i128mem:$src1, u8imm:$src2),
3847 !strconcat("v", OpcodeStr,
3848 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3850 (vt128 (OpNode (load addr:$src1),
3851 (i8 imm:$src2))))]>, VEX,
3852 Sched<[sched.XMM.Folded]>, VEX_WIG;
3855 let Predicates = [HasAVX2, prd] in {
3856 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3857 (ins VR256:$src1, u8imm:$src2),
3858 !strconcat("v", OpcodeStr,
3859 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3861 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
3862 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3863 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3864 (ins i256mem:$src1, u8imm:$src2),
3865 !strconcat("v", OpcodeStr,
3866 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3868 (vt256 (OpNode (load addr:$src1),
3869 (i8 imm:$src2))))]>, VEX, VEX_L,
3870 Sched<[sched.YMM.Folded]>, VEX_WIG;
3873 let Predicates = [UseSSE2] in {
3874 def ri : Ii8<0x70, MRMSrcReg,
3875 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3876 !strconcat(OpcodeStr,
3877 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3879 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
3881 def mi : Ii8<0x70, MRMSrcMem,
3882 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3883 !strconcat(OpcodeStr,
3884 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3886 (vt128 (OpNode (memop addr:$src1),
3887 (i8 imm:$src2))))]>,
3888 Sched<[sched.XMM.Folded]>;
3891 } // ExeDomain = SSEPackedInt
// Opcode 0x70 shuffle family built from sse2_pshuffle above, distinguished
// only by mandatory prefix: PSHUFD shuffles dwords (PD / 0x66), PSHUFHW the
// high four words (XS / 0xF3), PSHUFLW the low four words (XD / 0xF2).
// The word forms need NoVLX_Or_NoBWI for their AVX/AVX2 variants (AVX-512BW
// supersedes them); the dword form only needs NoVLX.
3893 defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3894                              SchedWriteShuffle, NoVLX>, PD;
3895 defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3896                              SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3897 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3898                              SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3900 //===---------------------------------------------------------------------===//
3901 // Packed Integer Pack Instructions (SSE & AVX)
3902 //===---------------------------------------------------------------------===//
3904 let ExeDomain = SSEPackedInt in {
3905 multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3906 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3907 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3908 PatFrag ld_frag, bit Is2Addr = 1> {
3909 def rr : PDI<opc, MRMSrcReg,
3910 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3912 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3913 !strconcat(OpcodeStr,
3914 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3916 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3918 def rm : PDI<opc, MRMSrcMem,
3919 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3921 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3922 !strconcat(OpcodeStr,
3923 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3925 (OutVT (OpNode (ArgVT RC:$src1),
3926 (ld_frag addr:$src2))))]>,
3927 Sched<[sched.Folded, sched.ReadAfterFold]>;
3930 multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3931 ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3932 X86MemOperand x86memop, X86FoldableSchedWrite sched,
3933 PatFrag ld_frag, bit Is2Addr = 1> {
3934 def rr : SS48I<opc, MRMSrcReg,
3935 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3937 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3938 !strconcat(OpcodeStr,
3939 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3941 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3943 def rm : SS48I<opc, MRMSrcMem,
3944 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3946 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3947 !strconcat(OpcodeStr,
3948 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3950 (OutVT (OpNode (ArgVT RC:$src1),
3951 (ld_frag addr:$src2))))]>,
3952 Sched<[sched.Folded, sched.ReadAfterFold]>;
3955 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3956 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3957 i128mem, SchedWriteShuffle.XMM, load, 0>,
3959 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3960 i128mem, SchedWriteShuffle.XMM, load, 0>,
3963 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3964 i128mem, SchedWriteShuffle.XMM, load, 0>,
3966 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3967 i128mem, SchedWriteShuffle.XMM, load, 0>,
3971 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3972 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3973 i256mem, SchedWriteShuffle.YMM, load, 0>,
3974 VEX_4V, VEX_L, VEX_WIG;
3975 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3976 i256mem, SchedWriteShuffle.YMM, load, 0>,
3977 VEX_4V, VEX_L, VEX_WIG;
3979 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3980 i256mem, SchedWriteShuffle.YMM, load, 0>,
3981 VEX_4V, VEX_L, VEX_WIG;
3982 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3983 i256mem, SchedWriteShuffle.YMM, load, 0>,
3987 let Constraints = "$src1 = $dst" in {
3988 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3989 i128mem, SchedWriteShuffle.XMM, memop>;
3990 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3991 i128mem, SchedWriteShuffle.XMM, memop>;
3993 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3994 i128mem, SchedWriteShuffle.XMM, memop>;
3996 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3997 i128mem, SchedWriteShuffle.XMM, memop>;
3999 } // ExeDomain = SSEPackedInt
4001 //===---------------------------------------------------------------------===//
4002 // SSE2 - Packed Integer Unpack Instructions
4003 //===---------------------------------------------------------------------===//
4005 let ExeDomain = SSEPackedInt in {
4006 multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
4007 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
4008 X86FoldableSchedWrite sched, PatFrag ld_frag,
4010 def rr : PDI<opc, MRMSrcReg,
4011 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4013 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
4014 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4015 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4017 def rm : PDI<opc, MRMSrcMem,
4018 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4020 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
4021 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4022 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4023 Sched<[sched.Folded, sched.ReadAfterFold]>;
4026 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4027 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
4028 i128mem, SchedWriteShuffle.XMM, load, 0>,
4030 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
4031 i128mem, SchedWriteShuffle.XMM, load, 0>,
4033 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
4034 i128mem, SchedWriteShuffle.XMM, load, 0>,
4036 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
4037 i128mem, SchedWriteShuffle.XMM, load, 0>,
4041 let Predicates = [HasAVX, NoVLX] in {
4042 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
4043 i128mem, SchedWriteShuffle.XMM, load, 0>,
4045 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
4046 i128mem, SchedWriteShuffle.XMM, load, 0>,
4048 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
4049 i128mem, SchedWriteShuffle.XMM, load, 0>,
4051 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
4052 i128mem, SchedWriteShuffle.XMM, load, 0>,
4056 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4057 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
4058 i256mem, SchedWriteShuffle.YMM, load, 0>,
4059 VEX_4V, VEX_L, VEX_WIG;
4060 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
4061 i256mem, SchedWriteShuffle.YMM, load, 0>,
4062 VEX_4V, VEX_L, VEX_WIG;
4063 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
4064 i256mem, SchedWriteShuffle.YMM, load, 0>,
4065 VEX_4V, VEX_L, VEX_WIG;
4066 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
4067 i256mem, SchedWriteShuffle.YMM, load, 0>,
4068 VEX_4V, VEX_L, VEX_WIG;
4071 let Predicates = [HasAVX2, NoVLX] in {
4072 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
4073 i256mem, SchedWriteShuffle.YMM, load, 0>,
4074 VEX_4V, VEX_L, VEX_WIG;
4075 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
4076 i256mem, SchedWriteShuffle.YMM, load, 0>,
4077 VEX_4V, VEX_L, VEX_WIG;
4078 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
4079 i256mem, SchedWriteShuffle.YMM, load, 0>,
4080 VEX_4V, VEX_L, VEX_WIG;
4081 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
4082 i256mem, SchedWriteShuffle.YMM, load, 0>,
4083 VEX_4V, VEX_L, VEX_WIG;
4086 let Constraints = "$src1 = $dst" in {
4087 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
4088 i128mem, SchedWriteShuffle.XMM, memop>;
4089 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
4090 i128mem, SchedWriteShuffle.XMM, memop>;
4091 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
4092 i128mem, SchedWriteShuffle.XMM, memop>;
4093 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
4094 i128mem, SchedWriteShuffle.XMM, memop>;
4096 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
4097 i128mem, SchedWriteShuffle.XMM, memop>;
4098 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
4099 i128mem, SchedWriteShuffle.XMM, memop>;
4100 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
4101 i128mem, SchedWriteShuffle.XMM, memop>;
4102 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
4103 i128mem, SchedWriteShuffle.XMM, memop>;
4105 } // ExeDomain = SSEPackedInt
4107 //===---------------------------------------------------------------------===//
4108 // SSE2 - Packed Integer Extract and Insert
4109 //===---------------------------------------------------------------------===//
4111 let ExeDomain = SSEPackedInt in {
4112 multiclass sse2_pinsrw<bit Is2Addr = 1> {
4113 def rr : Ii8<0xC4, MRMSrcReg,
4114 (outs VR128:$dst), (ins VR128:$src1,
4115 GR32orGR64:$src2, u8imm:$src3),
4117 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4118 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4120 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
4121 Sched<[WriteVecInsert]>;
4122 def rm : Ii8<0xC4, MRMSrcMem,
4123 (outs VR128:$dst), (ins VR128:$src1,
4124 i16mem:$src2, u8imm:$src3),
4126 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4127 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4129 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
4131 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
4135 let Predicates = [HasAVX, NoBWI] in
4136 def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
4137 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4138 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4139 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4141 PD, VEX, Sched<[WriteVecExtract]>;
4142 def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
4143 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4144 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4145 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4147 Sched<[WriteVecExtract]>;
4150 let Predicates = [HasAVX, NoBWI] in
4151 defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
4153 let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4154 defm PINSRW : sse2_pinsrw, PD;
4156 } // ExeDomain = SSEPackedInt
4158 //===---------------------------------------------------------------------===//
4159 // SSE2 - Packed Mask Creation
4160 //===---------------------------------------------------------------------===//
4162 let ExeDomain = SSEPackedInt in {
4164 def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4166 "pmovmskb\t{$src, $dst|$dst, $src}",
4167 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4168 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
4170 let Predicates = [HasAVX2] in {
4171 def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4173 "pmovmskb\t{$src, $dst|$dst, $src}",
4174 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
4175 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
4178 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4179 "pmovmskb\t{$src, $dst|$dst, $src}",
4180 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4181 Sched<[WriteVecMOVMSK]>;
4183 } // ExeDomain = SSEPackedInt
4185 //===---------------------------------------------------------------------===//
4186 // SSE2 - Conditional Store
4187 //===---------------------------------------------------------------------===//
4189 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4190 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4191 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4192 (ins VR128:$src, VR128:$mask),
4193 "maskmovdqu\t{$mask, $src|$src, $mask}",
4194 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4196 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4197 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4198 (ins VR128:$src, VR128:$mask),
4199 "maskmovdqu\t{$mask, $src|$src, $mask}",
4200 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4203 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4204 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4205 "maskmovdqu\t{$mask, $src|$src, $mask}",
4206 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4207 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4208 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4209 "maskmovdqu\t{$mask, $src|$src, $mask}",
4210 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4212 } // ExeDomain = SSEPackedInt
4214 //===---------------------------------------------------------------------===//
4215 // SSE2 - Move Doubleword/Quadword
4216 //===---------------------------------------------------------------------===//
4218 //===---------------------------------------------------------------------===//
4219 // Move Int Doubleword to Packed Double Int
4221 let ExeDomain = SSEPackedInt in {
4222 def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4223 "movd\t{$src, $dst|$dst, $src}",
4225 (v4i32 (scalar_to_vector GR32:$src)))]>,
4226 VEX, Sched<[WriteVecMoveFromGpr]>;
4227 def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4228 "movd\t{$src, $dst|$dst, $src}",
4230 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4231 VEX, Sched<[WriteVecLoad]>;
4232 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4233 "movq\t{$src, $dst|$dst, $src}",
4235 (v2i64 (scalar_to_vector GR64:$src)))]>,
4236 VEX, Sched<[WriteVecMoveFromGpr]>;
4237 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4238 def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4239 "movq\t{$src, $dst|$dst, $src}", []>,
4240 VEX, Sched<[WriteVecLoad]>;
4241 let isCodeGenOnly = 1 in
4242 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4243 "movq\t{$src, $dst|$dst, $src}",
4244 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4245 VEX, Sched<[WriteVecMoveFromGpr]>;
4247 def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4248 "movd\t{$src, $dst|$dst, $src}",
4250 (v4i32 (scalar_to_vector GR32:$src)))]>,
4251 Sched<[WriteVecMoveFromGpr]>;
4252 def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4253 "movd\t{$src, $dst|$dst, $src}",
4255 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4256 Sched<[WriteVecLoad]>;
4257 def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4258 "movq\t{$src, $dst|$dst, $src}",
4260 (v2i64 (scalar_to_vector GR64:$src)))]>,
4261 Sched<[WriteVecMoveFromGpr]>;
4262 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4263 def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4264 "movq\t{$src, $dst|$dst, $src}", []>,
4265 Sched<[WriteVecLoad]>;
4266 let isCodeGenOnly = 1 in
4267 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4268 "movq\t{$src, $dst|$dst, $src}",
4269 [(set FR64:$dst, (bitconvert GR64:$src))]>,
4270 Sched<[WriteVecMoveFromGpr]>;
4271 } // ExeDomain = SSEPackedInt
4273 //===---------------------------------------------------------------------===//
4274 // Move Int Doubleword to Single Scalar
// Bitcast moves GR32 -> FR32 ("movd", opcode 0x6E), register and load forms,
// in AVX (VEX-encoded) and SSE2 variants. Marked isCodeGenOnly because they
// share the 0x6E encoding with the (V)MOVDI2PDI instructions above — these
// exist only so ISel patterns can target an FR32 destination, not for the
// assembler/disassembler.
4276 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4277 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4278 "movd\t{$src, $dst|$dst, $src}",
4279 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4280 VEX, Sched<[WriteVecMoveFromGpr]>;
4282 def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4283 "movd\t{$src, $dst|$dst, $src}",
4284 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
4285 VEX, Sched<[WriteVecLoad]>;
4286 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4287 "movd\t{$src, $dst|$dst, $src}",
4288 [(set FR32:$dst, (bitconvert GR32:$src))]>,
4289 Sched<[WriteVecMoveFromGpr]>;
4291 def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4292 "movd\t{$src, $dst|$dst, $src}",
4293 [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
4294 Sched<[WriteVecLoad]>;
4295 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4297 //===---------------------------------------------------------------------===//
4298 // Move Packed Doubleword Int to Packed Double Int
4300 let ExeDomain = SSEPackedInt in {
4301 def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4302 "movd\t{$src, $dst|$dst, $src}",
4303 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4305 Sched<[WriteVecMoveToGpr]>;
4306 def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
4307 (ins i32mem:$dst, VR128:$src),
4308 "movd\t{$src, $dst|$dst, $src}",
4309 [(store (i32 (extractelt (v4i32 VR128:$src),
4310 (iPTR 0))), addr:$dst)]>,
4311 VEX, Sched<[WriteVecStore]>;
4312 def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4313 "movd\t{$src, $dst|$dst, $src}",
4314 [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4316 Sched<[WriteVecMoveToGpr]>;
4317 def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4318 "movd\t{$src, $dst|$dst, $src}",
4319 [(store (i32 (extractelt (v4i32 VR128:$src),
4320 (iPTR 0))), addr:$dst)]>,
4321 Sched<[WriteVecStore]>;
4322 } // ExeDomain = SSEPackedInt
4324 //===---------------------------------------------------------------------===//
4325 // Move Packed Doubleword Int first element to Doubleword Int
4327 let ExeDomain = SSEPackedInt in {
4328 let SchedRW = [WriteVecMoveToGpr] in {
4329 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4330 "movq\t{$src, $dst|$dst, $src}",
4331 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4335 def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4336 "movq\t{$src, $dst|$dst, $src}",
4337 [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4341 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4342 def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4343 (ins i64mem:$dst, VR128:$src),
4344 "movq\t{$src, $dst|$dst, $src}", []>,
4345 VEX, Sched<[WriteVecStore]>;
4346 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4347 def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4348 "movq\t{$src, $dst|$dst, $src}", []>,
4349 Sched<[WriteVecStore]>;
4350 } // ExeDomain = SSEPackedInt
4352 //===---------------------------------------------------------------------===//
4353 // Bitcast FR64 <-> GR64
// Bitcast moves between FR64 and GR64 via the "movq" encodings: FR64 load
// (0x7E /r from memory), FR64 -> GR64 register move, and FR64 -> i64 store,
// each in AVX (VEX) and SSE2 flavors. isCodeGenOnly: these duplicate
// existing movq encodings and exist only to give ISel bitconvert patterns
// an FR64-typed form.
4355 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4356 let Predicates = [UseAVX] in
4357 def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4358 "movq\t{$src, $dst|$dst, $src}",
4359 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4360 VEX, Sched<[WriteVecLoad]>;
4361 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4362 "movq\t{$src, $dst|$dst, $src}",
4363 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4364 VEX, Sched<[WriteVecMoveToGpr]>;
4365 def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4366 "movq\t{$src, $dst|$dst, $src}",
4367 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
4368 VEX, Sched<[WriteVecStore]>;
4370 def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4371 "movq\t{$src, $dst|$dst, $src}",
4372 [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4373 Sched<[WriteVecLoad]>;
4374 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4375 "movq\t{$src, $dst|$dst, $src}",
4376 [(set GR64:$dst, (bitconvert FR64:$src))]>,
4377 Sched<[WriteVecMoveToGpr]>;
4378 def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4379 "movq\t{$src, $dst|$dst, $src}",
4380 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
4381 Sched<[WriteVecStore]>;
4382 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4384 //===---------------------------------------------------------------------===//
4385 // Move Scalar Single to Double Int
// Bitcast moves FR32 -> GR32 ("movd", opcode 0x7E), register and store
// forms, AVX (VEX) and SSE2 variants. isCodeGenOnly pattern-only forms,
// mirroring the GR32 -> FR32 block above in the opposite direction.
4387 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4388 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4389 "movd\t{$src, $dst|$dst, $src}",
4390 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4391 VEX, Sched<[WriteVecMoveToGpr]>;
4392 def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4393 "movd\t{$src, $dst|$dst, $src}",
4394 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
4395 VEX, Sched<[WriteVecStore]>;
4396 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4397 "movd\t{$src, $dst|$dst, $src}",
4398 [(set GR32:$dst, (bitconvert FR32:$src))]>,
4399 Sched<[WriteVecMoveToGpr]>;
4400 def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4401 "movd\t{$src, $dst|$dst, $src}",
4402 [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>,
4403 Sched<[WriteVecStore]>;
4404 } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4406 let Predicates = [UseAVX] in {
4407 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4408 (VMOVDI2PDIrr GR32:$src)>;
4410 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4411 (VMOV64toPQIrr GR64:$src)>;
4413 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
4414 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
4415 (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
4416 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4417 // These instructions also write zeros in the high part of a 256-bit register.
4418 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4419 (VMOVDI2PDIrm addr:$src)>;
4420 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4421 (VMOVDI2PDIrm addr:$src)>;
4422 def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
4423 (VMOVDI2PDIrm addr:$src)>;
4424 def : Pat<(v4i32 (X86vzload addr:$src)),
4425 (VMOVDI2PDIrm addr:$src)>;
4426 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
4427 (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
4428 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4429 def : Pat<(v8i32 (X86vzload addr:$src)),
4430 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4431 // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
4432 def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
4433 (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
4434 (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
4437 let Predicates = [UseSSE2] in {
4438 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4439 (MOVDI2PDIrr GR32:$src)>;
4441 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4442 (MOV64toPQIrr GR64:$src)>;
4443 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
4444 (MOVDI2PDIrm addr:$src)>;
4445 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4446 (MOVDI2PDIrm addr:$src)>;
4447 def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
4448 (MOVDI2PDIrm addr:$src)>;
4449 def : Pat<(v4i32 (X86vzload addr:$src)),
4450 (MOVDI2PDIrm addr:$src)>;
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. To keep parsing that old
// assembly, accept "movd" as an alias for the 64-bit GPR<->XMM moves.
// The trailing 0 keeps each alias assembler-only (never used for printing).
4456 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4457 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4458 def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4459 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4460 // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4461 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4462 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4463 def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4464 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4466 //===---------------------------------------------------------------------===//
4467 // SSE2 - Move Quadword
4468 //===---------------------------------------------------------------------===//
4470 //===---------------------------------------------------------------------===//
4471 // Move Quadword Int to Packed Quadword Int
4474 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4475 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4476 "vmovq\t{$src, $dst|$dst, $src}",
4478 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4479 VEX, Requires<[UseAVX]>, VEX_WIG;
4480 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4481 "movq\t{$src, $dst|$dst, $src}",
4483 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4484 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4485 } // ExeDomain, SchedRW
4487 //===---------------------------------------------------------------------===//
4488 // Move Packed Quadword Int to Quadword Int
4490 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4491 def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4492 "movq\t{$src, $dst|$dst, $src}",
4493 [(store (i64 (extractelt (v2i64 VR128:$src),
4494 (iPTR 0))), addr:$dst)]>,
4496 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4497 "movq\t{$src, $dst|$dst, $src}",
4498 [(store (i64 (extractelt (v2i64 VR128:$src),
4499 (iPTR 0))), addr:$dst)]>;
4500 } // ExeDomain, SchedRW
4502 // For disassembler only
4503 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4504 SchedRW = [SchedWriteVecLogic.XMM] in {
4505 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4506 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4507 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4508 "movq\t{$src, $dst|$dst, $src}", []>;
4511 // Aliases to help the assembler pick two byte VEX encodings by swapping the
4512 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
4513 def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
4514 (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
// ".s"-suffixed spellings let users/tests explicitly request the store-form
// (MRMDestReg) encoding of the reg-reg move; assembler-only (trailing 0).
4516 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4517 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4518 def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4519 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4521 let Predicates = [UseAVX] in {
4522 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
4523 (VMOVQI2PQIrm addr:$src)>;
4524 def : Pat<(v2i64 (X86vzload addr:$src)),
4525 (VMOVQI2PQIrm addr:$src)>;
4526 def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
4527 (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
4528 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4529 def : Pat<(v4i64 (X86vzload addr:$src)),
4530 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4533 let Predicates = [UseSSE2] in {
4534 def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
4535 (MOVQI2PQIrm addr:$src)>;
4536 def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
4539 //===---------------------------------------------------------------------===//
4540 // Moving from XMM to XMM while clearing the upper 64 bits. Note: there is a
4541 // bug in the IA-32 documentation — movq xmm1, xmm2 does clear the high bits.
4543 let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4544 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4545 "vmovq\t{$src, $dst|$dst, $src}",
4546 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4547 XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4548 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4549 "movq\t{$src, $dst|$dst, $src}",
4550 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4551 XS, Requires<[UseSSE2]>;
4552 } // ExeDomain, SchedRW
4554 let Predicates = [UseAVX] in {
4555 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4556 (VMOVZPQILo2PQIrr VR128:$src)>;
4558 let Predicates = [UseSSE2] in {
4559 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4560 (MOVZPQILo2PQIrr VR128:$src)>;
4563 //===---------------------------------------------------------------------===//
4564 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4565 //===---------------------------------------------------------------------===//
// Template for MOVSHDUP/MOVSLDUP: a unary shuffle with a reg-reg form (rr)
// and a fold-from-memory form (rm). `mem_frag` selects aligned vs. unaligned
// load depending on the SSE/AVX instantiation.
4567 multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4568 ValueType vt, RegisterClass RC, PatFrag mem_frag,
4569 X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4570 def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4571 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4572 [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4574 def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4575 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4576 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4577 Sched<[sched.Folded]>;
// AVX forms use unaligned loads (loadv*); SSE forms require aligned memory
// (memopv4f32). VLX-capable targets use the EVEX versions defined elsewhere.
4580 let Predicates = [HasAVX, NoVLX] in {
4581 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4582 v4f32, VR128, loadv4f32, f128mem,
4583 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4584 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4585 v4f32, VR128, loadv4f32, f128mem,
4586 SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4587 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4588 v8f32, VR256, loadv8f32, f256mem,
4589 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4590 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4591 v8f32, VR256, loadv8f32, f256mem,
4592 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4594 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4595 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4596 defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4597 memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
// Integer-typed shuffles map onto the same FP movshdup/movsldup
// instructions (the operation is bit-pattern replication, type-agnostic).
4599 let Predicates = [HasAVX, NoVLX] in {
4600 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4601 (VMOVSHDUPrr VR128:$src)>;
4602 def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4603 (VMOVSHDUPrm addr:$src)>;
4604 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4605 (VMOVSLDUPrr VR128:$src)>;
4606 def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4607 (VMOVSLDUPrm addr:$src)>;
4608 def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4609 (VMOVSHDUPYrr VR256:$src)>;
4610 def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4611 (VMOVSHDUPYrm addr:$src)>;
4612 def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4613 (VMOVSLDUPYrr VR256:$src)>;
4614 def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4615 (VMOVSLDUPYrm addr:$src)>;
// SSE3 versions: note `memop` (aligned) instead of plain `load`.
4618 let Predicates = [UseSSE3] in {
4619 def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4620 (MOVSHDUPrr VR128:$src)>;
4621 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4622 (MOVSHDUPrm addr:$src)>;
4623 def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4624 (MOVSLDUPrr VR128:$src)>;
4625 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4626 (MOVSLDUPrm addr:$src)>;
4629 //===---------------------------------------------------------------------===//
4630 // SSE3 - Replicate Double FP - MOVDDUP
4631 //===---------------------------------------------------------------------===//
// MOVDDUP (xmm): duplicate the low double. The memory form only reads
// 64 bits (f64mem) and broadcasts it, hence the scalar_to_vector pattern.
4633 multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4634 def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4635 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4636 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4638 def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4639 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4642 (scalar_to_vector (loadf64 addr:$src)))))]>,
4643 Sched<[sched.XMM.Folded]>;
4646 // FIXME: Merge with above classes when there are patterns for the ymm version
// YMM MOVDDUP: duplicates the even doubles; memory form reads a full 256
// bits (f256mem), unlike the 64-bit xmm memory form above.
4647 multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4648 def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4649 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4650 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4652 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4653 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4655 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4656 Sched<[sched.YMM.Folded]>;
4659 let Predicates = [HasAVX, NoVLX] in {
4660 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4662 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4663 VEX, VEX_L, VEX_WIG;
4666 defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
// NOTE(review): the per-pattern Requires<[HasAVX]> below looks redundant —
// the enclosing `let Predicates = [HasAVX, NoVLX]` already implies it.
// Confirm whether Requires is intentionally overriding the Predicates list.
4669 let Predicates = [HasAVX, NoVLX] in {
4670 def : Pat<(X86Movddup (loadv2f64 addr:$src)),
4671 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4672 def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
4673 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
4676 let Predicates = [UseSSE3] in {
4677 // No need for aligned memory as this only loads 64-bits.
4678 def : Pat<(X86Movddup (loadv2f64 addr:$src)),
4679 (MOVDDUPrm addr:$src)>;
4680 def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
4681 (MOVDDUPrm addr:$src)>;
4684 //===---------------------------------------------------------------------===//
4685 // SSE3 - Move Unaligned Integer
4686 //===---------------------------------------------------------------------===//
// LDDQU: unaligned 128/256-bit integer load; selected only via the
// target-specific intrinsics (no generic load pattern).
4688 let Predicates = [HasAVX] in {
4689 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4690 "vlddqu\t{$src, $dst|$dst, $src}",
4691 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4692 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4693 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4694 "vlddqu\t{$src, $dst|$dst, $src}",
4695 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4696 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4699 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4700 "lddqu\t{$src, $dst|$dst, $src}",
4701 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4702 Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4704 //===---------------------------------------------------------------------===//
4705 // SSE3 - Arithmetic
4706 //===---------------------------------------------------------------------===//
// ADDSUBPS/PD template (opcode 0xD0): alternating subtract/add across
// lanes (X86Addsub). Is2Addr selects the SSE two-operand asm string vs.
// the three-operand AVX form.
4708 multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4709 X86MemOperand x86memop, X86FoldableSchedWrite sched,
4710 PatFrag ld_frag, bit Is2Addr = 1> {
4711 def rr : I<0xD0, MRMSrcReg,
4712 (outs RC:$dst), (ins RC:$src1, RC:$src2),
4714 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4715 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4716 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4718 def rm : I<0xD0, MRMSrcMem,
4719 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4721 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4722 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4723 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4724 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations: XD prefix selects the PS variants, PD prefix the PD
// variants. AVX forms are 3-operand (Is2Addr=0) and fold unaligned loads.
4727 let Predicates = [HasAVX] in {
4728 let ExeDomain = SSEPackedSingle in {
4729 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4730 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4731 XD, VEX_4V, VEX_WIG;
4732 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4733 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4734 XD, VEX_4V, VEX_L, VEX_WIG;
4736 let ExeDomain = SSEPackedDouble in {
4737 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4738 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4739 PD, VEX_4V, VEX_WIG;
4740 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4741 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4742 PD, VEX_4V, VEX_L, VEX_WIG;
// SSE3 forms are destructive two-operand (tied $src1 = $dst), aligned loads.
4745 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4746 let ExeDomain = SSEPackedSingle in
4747 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4748 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4749 let ExeDomain = SSEPackedDouble in
4750 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4751 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4754 //===---------------------------------------------------------------------===//
4755 // SSE3 Instructions
4756 //===---------------------------------------------------------------------===//
// Horizontal add/sub templates. S3D_Int uses the S3DI (F2-prefixed) base
// class for the PS flavors; S3_Int uses S3I (66-prefixed) for PD. Each has
// a reg-reg and a fold-from-memory form.
4759 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4760 X86MemOperand x86memop, SDNode OpNode,
4761 X86FoldableSchedWrite sched, PatFrag ld_frag,
4763 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4765 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4766 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4767 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4770 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4772 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4773 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4774 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4775 Sched<[sched.Folded, sched.ReadAfterFold]>;
4777 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4778 X86MemOperand x86memop, SDNode OpNode,
4779 X86FoldableSchedWrite sched, PatFrag ld_frag,
4781 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4783 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4784 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4785 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4788 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4790 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4791 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4792 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4793 Sched<[sched.Folded, sched.ReadAfterFold]>;
// HADDPS/HSUBPS (0x7C/0x7D via S3D_Int) and HADDPD/HSUBPD (via S3_Int).
// AVX forms: 3-operand, unaligned loads. SSE3 forms below: tied operands,
// aligned loads.
4796 let Predicates = [HasAVX] in {
4797 let ExeDomain = SSEPackedSingle in {
4798 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4799 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4800 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4801 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4802 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4803 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4804 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4805 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4807 let ExeDomain = SSEPackedDouble in {
4808 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4809 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4810 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4811 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4812 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4813 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4814 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4815 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4819 let Constraints = "$src1 = $dst" in {
4820 let ExeDomain = SSEPackedSingle in {
4821 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4822 WriteFHAdd, memopv4f32>;
4823 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4824 WriteFHAdd, memopv4f32>;
4826 let ExeDomain = SSEPackedDouble in {
4827 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4828 WriteFHAdd, memopv2f64>;
4829 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4830 WriteFHAdd, memopv2f64>;
4834 //===---------------------------------------------------------------------===//
4835 // SSSE3 - Packed Absolute Instructions
4836 //===---------------------------------------------------------------------===//
4838 /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 128-bit unary template (PABSB/W/D): reg-reg plus fold-from-memory forms.
4839 multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4840 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4841 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4843 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4844 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4847 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4849 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4851 (vt (OpNode (ld_frag addr:$src))))]>,
4852 Sched<[sched.XMM.Folded]>;
4855 /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 256-bit (AVX2) counterpart; suffixes Yrr/Yrm, always plain `load`.
4856 multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4857 SDNode OpNode, X86SchedWriteWidths sched> {
4858 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4860 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4861 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4864 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4866 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4868 (vt (OpNode (load addr:$src)))]>,
4869 Sched<[sched.YMM.Folded]>;
// PABS instantiations. Byte/word forms are gated on NoVLX_Or_NoBWI (the
// EVEX vpabsb/vpabsw need BWI); dword forms only on NoVLX.
4872 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4873 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4874 load>, VEX, VEX_WIG;
4875 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4876 load>, VEX, VEX_WIG;
4878 let Predicates = [HasAVX, NoVLX] in {
4879 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4880 load>, VEX, VEX_WIG;
4882 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4883 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4884 VEX, VEX_L, VEX_WIG;
4885 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4886 VEX, VEX_L, VEX_WIG;
4888 let Predicates = [HasAVX2, NoVLX] in {
4889 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4890 VEX, VEX_L, VEX_WIG;
4893 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4895 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4897 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4900 //===---------------------------------------------------------------------===//
4901 // SSSE3 - Packed Binary Operator Instructions
4902 //===---------------------------------------------------------------------===//
4904 /// SS3I_binop_rm - Simple SSSE3 bin op
// Generic SSSE3 binary template driven by an SDNode; DstVT may differ from
// OpVT (e.g. pmaddubsw: v16i8 inputs, v8i16 result).
4905 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4906 ValueType DstVT, ValueType OpVT, RegisterClass RC,
4907 PatFrag memop_frag, X86MemOperand x86memop,
4908 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4909 let isCommutable = 1 in
4910 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4911 (ins RC:$src1, RC:$src2),
4913 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4914 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4915 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4917 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4918 (ins RC:$src1, x86memop:$src2),
4920 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4921 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4923 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4924 Sched<[sched.Folded, sched.ReadAfterFold]>;
4927 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
// Same shape but driven directly by a target intrinsic (128-bit).
4928 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4929 Intrinsic IntId128, X86FoldableSchedWrite sched,
4930 PatFrag ld_frag, bit Is2Addr = 1> {
4931 let isCommutable = 1 in
4932 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4933 (ins VR128:$src1, VR128:$src2),
4935 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4936 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4937 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4939 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4940 (ins VR128:$src1, i128mem:$src2),
4942 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4943 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4945 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4946 Sched<[sched.Folded, sched.ReadAfterFold]>;
// 256-bit intrinsic variant: always AVX (3-operand asm), Yrr/Yrm suffixes.
4949 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4951 X86FoldableSchedWrite sched> {
4952 let isCommutable = 1 in
4953 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4954 (ins VR256:$src1, VR256:$src2),
4955 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4956 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4958 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4959 (ins VR256:$src1, i256mem:$src2),
4960 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4962 (IntId256 VR256:$src1, (load addr:$src2)))]>,
4963 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX 128-bit SSSE3 binops needing BWI gating (byte/word element types).
4966 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4967 let isCommutable = 0 in {
4968 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4969 VR128, load, i128mem,
4970 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4971 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4972 v16i8, VR128, load, i128mem,
4973 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4975 defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4976 VR128, load, i128mem,
4977 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
// Horizontal add/sub and sign ops: AVX-only predicate (no EVEX versions).
4980 let ImmT = NoImm, Predicates = [HasAVX] in {
4981 let isCommutable = 0 in {
4982 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4984 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4985 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4987 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4988 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4990 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
// VPHSUBD horizontal subtract (dword). VEX_WIG added for consistency with
// the sibling VPHADDW/VPHADDD/VPHSUBW defms above: these SSSE3 ops ignore
// VEX.W, so the encoding should be marked W-ignored like its neighbors.
4991 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4993 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
// PSIGN and saturating horizontal add/sub: intrinsic-driven templates.
4994 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
4995 int_x86_ssse3_psign_b_128,
4996 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4997 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
4998 int_x86_ssse3_psign_w_128,
4999 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
5000 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
5001 int_x86_ssse3_psign_d_128,
5002 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
5003 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
5004 int_x86_ssse3_phadd_sw_128,
5005 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
5006 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
5007 int_x86_ssse3_phsub_sw_128,
5008 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
// AVX2 256-bit versions of the BWI-gated binops above.
5012 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5013 let isCommutable = 0 in {
5014 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
5015 VR256, load, i256mem,
5016 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5017 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
5018 v32i8, VR256, load, i256mem,
5019 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5021 defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
5022 VR256, load, i256mem,
5023 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
// AVX2 256-bit horizontal add/sub.
5026 let ImmT = NoImm, Predicates = [HasAVX2] in {
5027 let isCommutable = 0 in {
5028 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
5029 VR256, load, i256mem,
5030 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5031 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
5033 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5034 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
5035 VR256, load, i256mem,
5036 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
// VPHSUBDY (256-bit horizontal subtract, dword). VEX_WIG added for
// consistency with VPHADDWY/VPHADDDY/VPHSUBWY above — VEX.W is ignored for
// these SSSE3/AVX2 ops, so the encoding should be marked W-ignored too.
5037 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
5039 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
// AVX2 256-bit psign / saturating horizontal ops (intrinsic-driven).
5040 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
5041 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
5042 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
5043 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
5044 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
5045 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
5046 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
5047 int_x86_avx2_phadd_sw,
5048 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
5049 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
5050 int_x86_avx2_phsub_sw,
5051 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
5055 // None of these have i8 immediate fields.
// Legacy SSE (two-operand, tied) SSSE3 binops; aligned `memop` loads.
5056 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
5057 let isCommutable = 0 in {
5058 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
5059 memop, i128mem, SchedWritePHAdd.XMM>;
5060 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
5061 memop, i128mem, SchedWritePHAdd.XMM>;
5062 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
5063 memop, i128mem, SchedWritePHAdd.XMM>;
5064 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
5065 memop, i128mem, SchedWritePHAdd.XMM>;
5066 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
5067 SchedWriteVecALU.XMM, memop>;
5068 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
5069 SchedWriteVecALU.XMM, memop>;
5070 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
5071 SchedWriteVecALU.XMM, memop>;
5072 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
5073 memop, i128mem, SchedWriteVarShuffle.XMM>;
5074 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
5075 int_x86_ssse3_phadd_sw_128,
5076 SchedWritePHAdd.XMM, memop>;
5077 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
5078 int_x86_ssse3_phsub_sw_128,
5079 SchedWritePHAdd.XMM, memop>;
5080 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
5081 v16i8, VR128, memop, i128mem,
5082 SchedWriteVecIMul.XMM>;
5084 defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
5085 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
5088 //===---------------------------------------------------------------------===//
5089 // SSSE3 - Packed Align Instruction Patterns
5090 //===---------------------------------------------------------------------===//
// PALIGNR template (0x0F /r ib): concatenates src1:src2 and extracts a
// byte-aligned window selected by the u8 immediate.
5092 multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
5093 PatFrag memop_frag, X86MemOperand x86memop,
5094 X86FoldableSchedWrite sched, bit Is2Addr = 1> {
5095 let hasSideEffects = 0 in {
5096 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
5097 (ins RC:$src1, RC:$src2, u8imm:$src3),
5099 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5101 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5102 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
5105 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
5106 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5108 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5110 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5111 [(set RC:$dst, (VT (X86PAlignr RC:$src1,
5112 (memop_frag addr:$src2),
5113 (i8 imm:$src3))))]>,
5114 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations: AVX/AVX2 are 3-operand with unaligned loads; SSE form is
// tied and uses aligned `memop`.
5118 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
5119 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
5120 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
5121 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
5122 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
5123 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
5124 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
5125 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
5126 SchedWriteShuffle.XMM>;
5128 //===---------------------------------------------------------------------===//
5129 // SSSE3 - Thread synchronization
5130 //===---------------------------------------------------------------------===//
// MONITOR/MWAIT. The pseudo is expanded by a custom inserter that moves the
// operands into the fixed EAX/ECX/EDX registers the real instruction reads.
5132 let SchedRW = [WriteSystem] in {
5133 let usesCustomInserter = 1 in {
5134 def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
5135 [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
5136 Requires<[HasSSE3]>;
5139 let Uses = [EAX, ECX, EDX] in
5140 def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
5141 TB, Requires<[HasSSE3]>;
5143 let Uses = [ECX, EAX] in
5144 def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
5145 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
// Assembly aliases accepting the explicit (ignored-by-encoding) register
// operands, mode-dependent (eax/ecx vs. rax/rcx spellings).
5148 def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
5149 def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
5151 def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
5152 Requires<[Not64BitMode]>;
5153 def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
5154 Requires<[In64BitMode]>;
5156 //===----------------------------------------------------------------------===//
5157 // SSE4.1 - Packed Move with Sign/Zero Extend
5158 //===----------------------------------------------------------------------===//
// PMOVSX/PMOVZX scaffolding. rrrm: one rr + one rm def with no patterns
// (patterns are attached separately below).
5160 multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5161 RegisterClass OutRC, RegisterClass InRC,
5162 X86FoldableSchedWrite sched> {
5163 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
5164 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5167 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
5168 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5169 Sched<[sched.Folded]>;
// rm_all: instantiates SSE, AVX ("V" prefix), and AVX2 ("V"+"Y") variants.
5172 multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
5173 X86MemOperand MemOp, X86MemOperand MemYOp,
5175 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5176 SchedWriteShuffle.XMM>;
5177 let Predicates = [HasAVX, prd] in
5178 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5179 VR128, VR128, SchedWriteShuffle.XMM>,
5181 let Predicates = [HasAVX2, prd] in
5182 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5183 VR256, VR128, WriteShuffle256>,
5184 VEX, VEX_L, VEX_WIG;
// Top level: sign-extend at `opc`, zero-extend at `opc + 0x10` (the
// fixed encoding distance between PMOVSX* and PMOVZX*).
5187 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5188 X86MemOperand MemYOp, Predicate prd> {
5189 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5190 MemOp, MemYOp, prd>;
5191 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5192 !strconcat("pmovzx", OpcodeStr),
5193 MemOp, MemYOp, prd>;
5196 defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5197 defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5198 defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5200 defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5201 defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5203 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5205 // Patterns that we also need for any_extend.
5206 // Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
// Patterns shared by sext/zext/anyext ("base" = same-element-count extends:
// BW, WD, DQ). anyext reuses the zext instructions.
5207 multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
5208 // Register-Register patterns
5209 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5210 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5211 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5214 let Predicates = [HasAVX2, NoVLX] in {
5215 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5216 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5218 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5219 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5222 // AVX2 Register-Memory patterns
5223 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5224 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5225 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5226 def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5227 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5228 def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5229 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5232 let Predicates = [HasAVX2, NoVLX] in {
5233 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5234 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5235 def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5236 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5237 def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5238 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5240 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5241 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5242 def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
5243 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5244 def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
5245 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
// Full sext/zext pattern set: inherits the base patterns and adds the
// "in-vector" extends (BD, BQ, WQ — fewer result elements than source),
// plus extload and partial-load forms.
5250 multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5251 SDNode ExtOp, SDNode InVecOp> :
5252 SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
5254 // Register-Register patterns
5255 let Predicates = [HasAVX2, NoVLX] in {
5256 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5257 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5258 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5259 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5261 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5262 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5265 // Simple Register-Memory patterns
// ExtTy ("s"/"z") selects the sextload/zextload PatFrag family by name.
5266 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5267 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5268 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5270 let Predicates = [HasAVX2, NoVLX] in {
5271 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5272 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5273 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5274 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5276 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5277 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5278 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5279 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5281 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5282 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5285 // AVX2 Register-Memory patterns
5286 let Predicates = [HasAVX2, NoVLX] in {
5287 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5288 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5289 def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5290 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5291 def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
5292 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5293 def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
5294 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5296 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5297 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5298 def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5299 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5300 def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
5301 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5302 def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
5303 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5305 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5306 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5307 def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5308 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5309 def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
5310 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5311 def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
5312 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
// Instantiate the AVX2 pattern sets; anyext reuses the PMOVZX opcodes via
// the base (same-element-count) multiclass only.
5316 defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5317 defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5318 defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;
5320 // SSE4.1/AVX patterns.
// Maps extension dags onto the 128-bit [V]PMOVSX/[V]PMOVZX instructions
// selected by OpcPrefix; ExtTy ("s"/"z") picks the matching extload PatFrags.
5321 multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
// Register-register forms. Byte sources need NoVLX_Or_NoBWI; wider element
// sources only need NoVLX (AVX512VL/BWI provide their own versions).
5323 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5324 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5325 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5327 let Predicates = [HasAVX, NoVLX] in {
5328 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5329 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5330 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5331 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5333 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5334 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5335 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5336 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5338 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5339 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
// Memory forms selected through the ExtTy-specific extloadvi* fragments.
5341 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5342 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5343 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5345 let Predicates = [HasAVX, NoVLX] in {
5346 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5347 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5348 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5349 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5351 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5352 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5353 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5354 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5356 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5357 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// Memory forms matching explicit partial-load idioms (scalar_to_vector of a
// narrow load, vzmovl/vzload, or a full vector load feeding the extend).
5359 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5360 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5361 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5362 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5363 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5364 def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
5365 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5366 def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5367 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5368 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5369 (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5371 let Predicates = [HasAVX, NoVLX] in {
5372 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5373 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5374 def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5375 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5376 def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5377 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5378 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5379 (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
// BQ only consumes two source bytes, so a 16-bit extload suffices here.
5381 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5382 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5383 def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
5384 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5385 def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
5386 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5387 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5388 (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5390 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5391 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5392 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5393 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5394 def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
5395 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5396 def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5397 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5398 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5399 (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5401 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5402 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5403 def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
5404 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5405 def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
5406 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5407 def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5408 (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5410 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5411 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5412 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5413 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5414 def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
5415 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5416 def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
5417 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5418 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5419 (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
// AVX (VEX-encoded) forms.
5423 defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5424 defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
// SSE4.1-only (legacy-encoded) forms.
5426 let Predicates = [UseSSE41] in {
5427 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5428 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5431 //===----------------------------------------------------------------------===//
5432 // SSE4.1 - Extract Instructions
5433 //===----------------------------------------------------------------------===//
5435 /// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5436 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
// Register form: byte $src2 of the source vector, zero-positioned into a GPR.
5437 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5438 (ins VR128:$src1, u8imm:$src2),
5439 !strconcat(OpcodeStr,
5440 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5441 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5443 Sched<[WriteVecExtract]>;
// Store form: extracted byte is truncated to i8 and written to memory.
5444 let hasSideEffects = 0, mayStore = 1 in
5445 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5446 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5447 !strconcat(OpcodeStr,
5448 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5449 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5450 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// AVX512BW supersedes the VEX form, hence NoBWI.
5453 let Predicates = [HasAVX, NoBWI] in
5454 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
5456 defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5459 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5460 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
// Register form exists only for the disassembler (the SSE2 PEXTRW encoding
// is preferred for codegen), so it carries no pattern.
5461 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5462 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5463 (ins VR128:$src1, u8imm:$src2),
5464 !strconcat(OpcodeStr,
5465 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5466 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
// Store form: extracted word truncated to i16 and written to memory.
5468 let hasSideEffects = 0, mayStore = 1 in
5469 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5470 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5471 !strconcat(OpcodeStr,
5472 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5473 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5474 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5477 let Predicates = [HasAVX, NoBWI] in
5478 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
5480 defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5483 /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5484 multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
// Register form: element $src2 of a v4i32 into a 32-bit GPR.
5485 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5486 (ins VR128:$src1, u8imm:$src2),
5487 !strconcat(OpcodeStr,
5488 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5490 (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5491 Sched<[WriteVecExtract]>;
// Store form: same element written directly to memory.
5492 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5493 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5494 !strconcat(OpcodeStr,
5495 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5496 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5497 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// AVX512DQ supersedes the VEX form, hence NoDQI.
5500 let Predicates = [HasAVX, NoDQI] in
5501 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5503 defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5505 /// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
5506 multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
// Register form: element $src2 of a v2i64 into a 64-bit GPR.
5507 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5508 (ins VR128:$src1, u8imm:$src2),
5509 !strconcat(OpcodeStr,
5510 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5512 (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5513 Sched<[WriteVecExtract]>;
// Store form: same element written directly to memory.
5514 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5515 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5516 !strconcat(OpcodeStr,
5517 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5518 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5519 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
// 64-bit forms require the REX.W / VEX.W prefix.
5522 let Predicates = [HasAVX, NoDQI] in
5523 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5525 defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5527 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5529 multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
// The f32 lane is extracted via a bitcast to v4i32, so the result is the
// raw 32-bit pattern in a GPR (or memory), not a converted value.
5530 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5531 (ins VR128:$src1, u8imm:$src2),
5532 !strconcat(OpcodeStr,
5533 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5534 [(set GR32orGR64:$dst,
5535 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5536 Sched<[WriteVecExtract]>;
5537 def mr : SS4AIi8<opc, MRMDestMem, (outs),
5538 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5539 !strconcat(OpcodeStr,
5540 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5541 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5542 addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5545 let ExeDomain = SSEPackedSingle in {
5546 let Predicates = [UseAVX] in
5547 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5548 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
5551 // Also match an EXTRACTPS store when the store is done as f32 instead of i32.
5552 def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
5555 (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
5557 def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
5560 (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
5561 Requires<[UseSSE41]>;
5563 //===----------------------------------------------------------------------===//
5564 // SSE4.1 - Insert Instructions
5565 //===----------------------------------------------------------------------===//
// Insert a byte from a GPR (or from memory) into lane $src3 of $src1.
// Is2Addr selects the two-operand SSE syntax vs. the three-operand VEX one.
5567 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5568 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5569 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5571 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5573 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5575 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5576 Sched<[WriteVecInsert]>;
// Memory form: folds an extending i8 load as the inserted value.
5577 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5578 (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5580 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5582 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5584 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5585 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5588 let Predicates = [HasAVX, NoBWI] in
5589 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
5590 let Constraints = "$src1 = $dst" in
5591 defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
// Insert a 32-bit GPR (or a loaded i32) into lane $src3 of a v4i32.
5593 multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5594 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5595 (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5597 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5599 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5601 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5602 Sched<[WriteVecInsert]>;
5603 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5604 (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5606 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5608 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5610 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5611 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// AVX512DQ supersedes the VEX form, hence NoDQI.
5614 let Predicates = [HasAVX, NoDQI] in
5615 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5616 let Constraints = "$src1 = $dst" in
5617 defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
// Insert a 64-bit GPR (or a loaded i64) into lane $src3 of a v2i64.
5619 multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5620 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5621 (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5623 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5625 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5627 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5628 Sched<[WriteVecInsert]>;
5629 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5630 (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5632 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5634 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5636 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5637 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
// 64-bit forms require the REX.W / VEX.W prefix.
5640 let Predicates = [HasAVX, NoDQI] in
5641 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5642 let Constraints = "$src1 = $dst" in
5643 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5645 // insertps has a few different modes, there's the first two here below which
5646 // are optimized inserts that won't zero arbitrary elements in the destination
5647 // vector. The next one matches the intrinsic and could zero arbitrary elements
5648 // in the target vector.
5649 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5650 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5651 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5653 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5655 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5657 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
5658 Sched<[SchedWriteFShuffle.XMM]>;
// Memory form: the loaded f32 is wrapped in scalar_to_vector so the same
// X86insertps node shape matches.
5659 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5660 (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5662 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5664 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5666 (X86insertps VR128:$src1,
5667 (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5669 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5672 let ExeDomain = SSEPackedSingle in {
5673 let Predicates = [UseAVX] in
5674 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5676 let Constraints = "$src1 = $dst" in
5677 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5680 let Predicates = [UseAVX] in {
5681 // If we're inserting an element from a vbroadcast of a load, fold the
5682 // load into the X86insertps instruction.
5683 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
5684 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
5685 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
5686 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
5687 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
5688 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
5691 //===----------------------------------------------------------------------===//
5692 // SSE4.1 - Round Instructions
5693 //===----------------------------------------------------------------------===//
// Packed FP rounding (roundps/roundpd and their VEX forms): round each
// element of RC according to the immediate rounding-control operand.
5695 multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5696 X86MemOperand x86memop, RegisterClass RC,
5697 ValueType VT, PatFrag mem_frag, SDNode OpNode,
5698 X86FoldableSchedWrite sched> {
5699 // Intrinsic operation, reg.
5700 // Vector intrinsic operation, reg
5701 def r : SS4AIi8<opc, MRMSrcReg,
5702 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5703 !strconcat(OpcodeStr,
5704 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5705 [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
5708 // Vector intrinsic operation, mem
5709 def m : SS4AIi8<opc, MRMSrcMem,
5710 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5711 !strconcat(OpcodeStr,
5712 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5714 (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
5715 Sched<[sched.Folded]>;
// VEX scalar round forms (vroundss/vroundsd) operating on FR32/FR64.
// These carry no selection patterns (selected via Pat<> elsewhere), hence
// the empty pattern lists and hasSideEffects = 0.
5718 multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5719 string OpcodeStr, X86FoldableSchedWrite sched> {
5720 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5721 def SSr : SS4AIi8<opcss, MRMSrcReg,
5722 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5723 !strconcat(OpcodeStr,
5724 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5725 []>, Sched<[sched]>;
5728 def SSm : SS4AIi8<opcss, MRMSrcMem,
5729 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5730 !strconcat(OpcodeStr,
5731 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5732 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5733 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5735 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5736 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5737 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5738 !strconcat(OpcodeStr,
5739 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5740 []>, Sched<[sched]>;
5743 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5744 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5745 !strconcat(OpcodeStr,
5746 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5747 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5748 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// SSE (two-operand) scalar round forms (roundss/roundsd). Pattern-less,
// like the VEX versions above; selection happens via explicit Pat<> defs.
5751 multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5752 string OpcodeStr, X86FoldableSchedWrite sched> {
5753 let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
5754 def SSr : SS4AIi8<opcss, MRMSrcReg,
5755 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5756 !strconcat(OpcodeStr,
5757 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5758 []>, Sched<[sched]>;
5761 def SSm : SS4AIi8<opcss, MRMSrcMem,
5762 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5763 !strconcat(OpcodeStr,
5764 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5765 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5766 } // ExeDomain = SSEPackedSingle, hasSideEffects = 0
5768 let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
5769 def SDr : SS4AIi8<opcsd, MRMSrcReg,
5770 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5771 !strconcat(OpcodeStr,
5772 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5773 []>, Sched<[sched]>;
5776 def SDm : SS4AIi8<opcsd, MRMSrcMem,
5777 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5778 !strconcat(OpcodeStr,
5779 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5780 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5781 } // ExeDomain = SSEPackedDouble, hasSideEffects = 0
// Intrinsic (VR128) scalar round forms, matched through OpNode
// (X86RndScales): round the low element of $src2 into $src1's low lane.
5784 multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5785 string OpcodeStr, X86FoldableSchedWrite sched,
5786 ValueType VT32, ValueType VT64,
5787 SDNode OpNode, bit Is2Addr = 1> {
5788 let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
5789 def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5790 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5792 !strconcat(OpcodeStr,
5793 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5794 !strconcat(OpcodeStr,
5795 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5796 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
// Memory form folds a scalar f32 load via the sse_load_f32 complex pattern.
5799 def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5800 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5802 !strconcat(OpcodeStr,
5803 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5804 !strconcat(OpcodeStr,
5805 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5807 (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
5808 Sched<[sched.Folded, sched.ReadAfterFold]>;
5809 } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
5811 let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
5812 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5813 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5815 !strconcat(OpcodeStr,
5816 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5817 !strconcat(OpcodeStr,
5818 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5819 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
5822 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5823 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5825 !strconcat(OpcodeStr,
5826 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5827 !strconcat(OpcodeStr,
5828 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5830 (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
5831 Sched<[sched.Folded, sched.ReadAfterFold]>;
5832 } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
5835 // FP round - roundss, roundps, roundsd, roundpd
// VEX-encoded packed forms, 128-bit and 256-bit.
5836 let Predicates = [HasAVX, NoVLX] in {
5837 let ExeDomain = SSEPackedSingle in {
5839 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5840 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
5842 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5843 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
5844 VEX, VEX_L, VEX_WIG;
5847 let ExeDomain = SSEPackedDouble in {
5848 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5849 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
5851 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5852 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
5853 VEX, VEX_L, VEX_WIG;
// VEX-encoded scalar forms: intrinsic (VR128) and FR32/FR64 variants.
5856 let Predicates = [HasAVX, NoAVX512] in {
5857 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5858 v4f32, v2f64, X86RndScales, 0>,
5859 VEX_4V, VEX_LIG, VEX_WIG;
5860 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5861 VEX_4V, VEX_LIG, VEX_WIG;
// Lower generic scalar rounding nodes onto VROUNDSS/VROUNDSD. Immediates:
// 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = nearbyint, 0x4 = rint
// (round using the current rounding mode). The first operand is a
// don't-care pass-through lane, supplied as IMPLICIT_DEF.
5864 let Predicates = [UseAVX] in {
5865 def : Pat<(ffloor FR32:$src),
5866 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
5867 def : Pat<(f32 (fnearbyint FR32:$src)),
5868 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
5869 def : Pat<(f32 (fceil FR32:$src)),
5870 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
5871 def : Pat<(f32 (frint FR32:$src)),
5872 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
5873 def : Pat<(f32 (ftrunc FR32:$src)),
5874 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
5876 def : Pat<(f64 (ffloor FR64:$src)),
5877 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
5878 def : Pat<(f64 (fnearbyint FR64:$src)),
5879 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
5880 def : Pat<(f64 (fceil FR64:$src)),
5881 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
5882 def : Pat<(f64 (frint FR64:$src)),
5883 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
5884 def : Pat<(f64 (ftrunc FR64:$src)),
5885 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
// Load-folding variants are restricted to OptForSize: folding duplicates
// the load if the value has other uses, so it is only a size win.
5888 let Predicates = [UseAVX, OptForSize] in {
5889 def : Pat<(ffloor (loadf32 addr:$src)),
5890 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
5891 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
5892 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
5893 def : Pat<(f32 (fceil (loadf32 addr:$src))),
5894 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
5895 def : Pat<(f32 (frint (loadf32 addr:$src))),
5896 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
5897 def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
5898 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
5900 def : Pat<(f64 (ffloor (loadf64 addr:$src))),
5901 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>;
5902 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
5903 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>;
5904 def : Pat<(f64 (fceil (loadf64 addr:$src))),
5905 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>;
5906 def : Pat<(f64 (frint (loadf64 addr:$src))),
5907 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>;
5908 def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
5909 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>;
// Lower generic packed rounding nodes onto VROUNDPS/VROUNDPD (128- and
// 256-bit, register and load-folded). Immediates: 0x9 = floor, 0xA = ceil,
// 0xB = trunc, 0xC = nearbyint, 0x4 = rint (current rounding mode).
5912 let Predicates = [HasAVX, NoVLX] in {
5913 def : Pat<(v4f32 (ffloor VR128:$src)),
5914 (VROUNDPSr VR128:$src, (i32 0x9))>;
5915 def : Pat<(v4f32 (fnearbyint VR128:$src)),
5916 (VROUNDPSr VR128:$src, (i32 0xC))>;
5917 def : Pat<(v4f32 (fceil VR128:$src)),
5918 (VROUNDPSr VR128:$src, (i32 0xA))>;
5919 def : Pat<(v4f32 (frint VR128:$src)),
5920 (VROUNDPSr VR128:$src, (i32 0x4))>;
5921 def : Pat<(v4f32 (ftrunc VR128:$src)),
5922 (VROUNDPSr VR128:$src, (i32 0xB))>;
5924 def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))),
5925 (VROUNDPSm addr:$src, (i32 0x9))>;
5926 def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))),
5927 (VROUNDPSm addr:$src, (i32 0xC))>;
5928 def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))),
5929 (VROUNDPSm addr:$src, (i32 0xA))>;
5930 def : Pat<(v4f32 (frint (loadv4f32 addr:$src))),
5931 (VROUNDPSm addr:$src, (i32 0x4))>;
5932 def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))),
5933 (VROUNDPSm addr:$src, (i32 0xB))>;
// v2f64, 128-bit.
5935 def : Pat<(v2f64 (ffloor VR128:$src)),
5936 (VROUNDPDr VR128:$src, (i32 0x9))>;
5937 def : Pat<(v2f64 (fnearbyint VR128:$src)),
5938 (VROUNDPDr VR128:$src, (i32 0xC))>;
5939 def : Pat<(v2f64 (fceil VR128:$src)),
5940 (VROUNDPDr VR128:$src, (i32 0xA))>;
5941 def : Pat<(v2f64 (frint VR128:$src)),
5942 (VROUNDPDr VR128:$src, (i32 0x4))>;
5943 def : Pat<(v2f64 (ftrunc VR128:$src)),
5944 (VROUNDPDr VR128:$src, (i32 0xB))>;
5946 def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))),
5947 (VROUNDPDm addr:$src, (i32 0x9))>;
5948 def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))),
5949 (VROUNDPDm addr:$src, (i32 0xC))>;
5950 def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))),
5951 (VROUNDPDm addr:$src, (i32 0xA))>;
5952 def : Pat<(v2f64 (frint (loadv2f64 addr:$src))),
5953 (VROUNDPDm addr:$src, (i32 0x4))>;
5954 def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))),
5955 (VROUNDPDm addr:$src, (i32 0xB))>;
// v8f32, 256-bit.
5957 def : Pat<(v8f32 (ffloor VR256:$src)),
5958 (VROUNDPSYr VR256:$src, (i32 0x9))>;
5959 def : Pat<(v8f32 (fnearbyint VR256:$src)),
5960 (VROUNDPSYr VR256:$src, (i32 0xC))>;
5961 def : Pat<(v8f32 (fceil VR256:$src)),
5962 (VROUNDPSYr VR256:$src, (i32 0xA))>;
5963 def : Pat<(v8f32 (frint VR256:$src)),
5964 (VROUNDPSYr VR256:$src, (i32 0x4))>;
5965 def : Pat<(v8f32 (ftrunc VR256:$src)),
5966 (VROUNDPSYr VR256:$src, (i32 0xB))>;
5968 def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))),
5969 (VROUNDPSYm addr:$src, (i32 0x9))>;
5970 def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))),
5971 (VROUNDPSYm addr:$src, (i32 0xC))>;
5972 def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))),
5973 (VROUNDPSYm addr:$src, (i32 0xA))>;
5974 def : Pat<(v8f32 (frint (loadv8f32 addr:$src))),
5975 (VROUNDPSYm addr:$src, (i32 0x4))>;
5976 def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))),
5977 (VROUNDPSYm addr:$src, (i32 0xB))>;
// v4f64, 256-bit.
5979 def : Pat<(v4f64 (ffloor VR256:$src)),
5980 (VROUNDPDYr VR256:$src, (i32 0x9))>;
5981 def : Pat<(v4f64 (fnearbyint VR256:$src)),
5982 (VROUNDPDYr VR256:$src, (i32 0xC))>;
5983 def : Pat<(v4f64 (fceil VR256:$src)),
5984 (VROUNDPDYr VR256:$src, (i32 0xA))>;
5985 def : Pat<(v4f64 (frint VR256:$src)),
5986 (VROUNDPDYr VR256:$src, (i32 0x4))>;
5987 def : Pat<(v4f64 (ftrunc VR256:$src)),
5988 (VROUNDPDYr VR256:$src, (i32 0xB))>;
5990 def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))),
5991 (VROUNDPDYm addr:$src, (i32 0x9))>;
5992 def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))),
5993 (VROUNDPDYm addr:$src, (i32 0xC))>;
5994 def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))),
5995 (VROUNDPDYm addr:$src, (i32 0xA))>;
5996 def : Pat<(v4f64 (frint (loadv4f64 addr:$src))),
5997 (VROUNDPDYm addr:$src, (i32 0x4))>;
5998 def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))),
5999 (VROUNDPDYm addr:$src, (i32 0xB))>;
// Legacy SSE4.1 round instructions. Packed memory forms use memop
// (alignment-checked) rather than the unaligned AVX load fragments.
6002 let ExeDomain = SSEPackedSingle in
6003 defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
6004 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
6005 let ExeDomain = SSEPackedDouble in
6006 defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
6007 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
6009 defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
// Two-address intrinsic forms ($src1 is also the destination).
6011 let Constraints = "$src1 = $dst" in
6012 defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
6013 v4f32, v2f64, X86RndScales>;
// SSE4.1 lowering of generic scalar rounding nodes onto ROUNDSS/ROUNDSD.
// Immediates: 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = nearbyint,
// 0x4 = rint (current rounding mode).
6015 let Predicates = [UseSSE41] in {
6016 def : Pat<(ffloor FR32:$src),
6017 (ROUNDSSr FR32:$src, (i32 0x9))>;
6018 def : Pat<(f32 (fnearbyint FR32:$src)),
6019 (ROUNDSSr FR32:$src, (i32 0xC))>;
6020 def : Pat<(f32 (fceil FR32:$src)),
6021 (ROUNDSSr FR32:$src, (i32 0xA))>;
6022 def : Pat<(f32 (frint FR32:$src)),
6023 (ROUNDSSr FR32:$src, (i32 0x4))>;
6024 def : Pat<(f32 (ftrunc FR32:$src)),
6025 (ROUNDSSr FR32:$src, (i32 0xB))>;
6027 def : Pat<(f64 (ffloor FR64:$src)),
6028 (ROUNDSDr FR64:$src, (i32 0x9))>;
6029 def : Pat<(f64 (fnearbyint FR64:$src)),
6030 (ROUNDSDr FR64:$src, (i32 0xC))>;
6031 def : Pat<(f64 (fceil FR64:$src)),
6032 (ROUNDSDr FR64:$src, (i32 0xA))>;
6033 def : Pat<(f64 (frint FR64:$src)),
6034 (ROUNDSDr FR64:$src, (i32 0x4))>;
6035 def : Pat<(f64 (ftrunc FR64:$src)),
6036 (ROUNDSDr FR64:$src, (i32 0xB))>;
// Load-folding only under OptForSize (folding can duplicate the load).
6039 let Predicates = [UseSSE41, OptForSize] in {
6040 def : Pat<(ffloor (loadf32 addr:$src)),
6041 (ROUNDSSm addr:$src, (i32 0x9))>;
6042 def : Pat<(f32 (fnearbyint (loadf32 addr:$src))),
6043 (ROUNDSSm addr:$src, (i32 0xC))>;
6044 def : Pat<(f32 (fceil (loadf32 addr:$src))),
6045 (ROUNDSSm addr:$src, (i32 0xA))>;
6046 def : Pat<(f32 (frint (loadf32 addr:$src))),
6047 (ROUNDSSm addr:$src, (i32 0x4))>;
6048 def : Pat<(f32 (ftrunc (loadf32 addr:$src))),
6049 (ROUNDSSm addr:$src, (i32 0xB))>;
6051 def : Pat<(f64 (ffloor (loadf64 addr:$src))),
6052 (ROUNDSDm addr:$src, (i32 0x9))>;
6053 def : Pat<(f64 (fnearbyint (loadf64 addr:$src))),
6054 (ROUNDSDm addr:$src, (i32 0xC))>;
6055 def : Pat<(f64 (fceil (loadf64 addr:$src))),
6056 (ROUNDSDm addr:$src, (i32 0xA))>;
6057 def : Pat<(f64 (frint (loadf64 addr:$src))),
6058 (ROUNDSDm addr:$src, (i32 0x4))>;
6059 def : Pat<(f64 (ftrunc (loadf64 addr:$src))),
6060 (ROUNDSDm addr:$src, (i32 0xB))>;
// SSE4.1 lowering of generic packed rounding nodes onto ROUNDPS/ROUNDPD.
// Memory forms require aligned loads (memop fragments). Immediates as in
// the scalar patterns: 0x9 floor, 0xA ceil, 0xB trunc, 0xC nearbyint,
// 0x4 rint.
6063 let Predicates = [UseSSE41] in {
6064 def : Pat<(v4f32 (ffloor VR128:$src)),
6065 (ROUNDPSr VR128:$src, (i32 0x9))>;
6066 def : Pat<(v4f32 (fnearbyint VR128:$src)),
6067 (ROUNDPSr VR128:$src, (i32 0xC))>;
6068 def : Pat<(v4f32 (fceil VR128:$src)),
6069 (ROUNDPSr VR128:$src, (i32 0xA))>;
6070 def : Pat<(v4f32 (frint VR128:$src)),
6071 (ROUNDPSr VR128:$src, (i32 0x4))>;
6072 def : Pat<(v4f32 (ftrunc VR128:$src)),
6073 (ROUNDPSr VR128:$src, (i32 0xB))>;
6075 def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))),
6076 (ROUNDPSm addr:$src, (i32 0x9))>;
6077 def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))),
6078 (ROUNDPSm addr:$src, (i32 0xC))>;
6079 def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))),
6080 (ROUNDPSm addr:$src, (i32 0xA))>;
6081 def : Pat<(v4f32 (frint (memopv4f32 addr:$src))),
6082 (ROUNDPSm addr:$src, (i32 0x4))>;
6083 def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))),
6084 (ROUNDPSm addr:$src, (i32 0xB))>;
// v2f64 forms.
6086 def : Pat<(v2f64 (ffloor VR128:$src)),
6087 (ROUNDPDr VR128:$src, (i32 0x9))>;
6088 def : Pat<(v2f64 (fnearbyint VR128:$src)),
6089 (ROUNDPDr VR128:$src, (i32 0xC))>;
6090 def : Pat<(v2f64 (fceil VR128:$src)),
6091 (ROUNDPDr VR128:$src, (i32 0xA))>;
6092 def : Pat<(v2f64 (frint VR128:$src)),
6093 (ROUNDPDr VR128:$src, (i32 0x4))>;
6094 def : Pat<(v2f64 (ftrunc VR128:$src)),
6095 (ROUNDPDr VR128:$src, (i32 0xB))>;
6097 def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))),
6098 (ROUNDPDm addr:$src, (i32 0x9))>;
6099 def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))),
6100 (ROUNDPDm addr:$src, (i32 0xC))>;
6101 def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))),
6102 (ROUNDPDm addr:$src, (i32 0xA))>;
6103 def : Pat<(v2f64 (frint (memopv2f64 addr:$src))),
6104 (ROUNDPDm addr:$src, (i32 0x4))>;
6105 def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))),
6106 (ROUNDPDm addr:$src, (i32 0xB))>;
// Match floor/ceil applied to the low lane of a vector (via X86Movss/Movsd)
// onto the scalar ROUNDSS/ROUNDSD forms. NOTE(review): the 0x01/0x02
// immediates here appear to be indices into a rounding-mode mapping inside
// scalar_unary_math_imm_patterns (defined elsewhere), not the raw 0x9/0xA
// encodings used above — confirm against that multiclass.
6109 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
6110 v4f32, 0x01, UseSSE41>;
6111 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
6112 v4f32, 0x02, UseSSE41>;
6113 defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
6114 v2f64, 0x01, UseSSE41>;
6115 defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
6116 v2f64, 0x02, UseSSE41>;
6118 //===----------------------------------------------------------------------===//
6119 // SSE4.1 - Packed Bit Test
6120 //===----------------------------------------------------------------------===//
6122 // ptest instruction we'll lower to this in X86ISelLowering primarily from
6123 // the intel intrinsic that corresponds to this.
// (V)PTEST sets ZF/CF from src1 AND/ANDN src2; only EFLAGS are defined,
// there is no register result (empty outs).
6124 let Defs = [EFLAGS], Predicates = [HasAVX] in {
6125 def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
6126 "vptest\t{$src2, $src1|$src1, $src2}",
6127 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
6128 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
6129 def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
6130 "vptest\t{$src2, $src1|$src1, $src2}",
6131 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
6132 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
// 256-bit VEX.L forms (AVX).
6135 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
6136 "vptest\t{$src2, $src1|$src1, $src2}",
6137 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
6138 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
6139 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
6140 "vptest\t{$src2, $src1|$src1, $src2}",
6141 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
6142 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
6143 VEX, VEX_L, VEX_WIG;
// Legacy SSE4.1 encoding (no predicate; memory form requires alignment
// via memopv2i64).
6146 let Defs = [EFLAGS] in {
6147 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
6148 "ptest\t{$src2, $src1|$src1, $src2}",
6149 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
6150 Sched<[SchedWriteVecTest.XMM]>;
6151 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
6152 "ptest\t{$src2, $src1|$src1, $src2}",
6153 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
6154 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
6157 // The bit test instructions below are AVX only
// avx_bittest - rr and rm forms of VTESTPS/VTESTPD: sign-bit test of packed
// FP elements into EFLAGS (X86testp node). No destination register.
6158 multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
6159 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
6160 X86FoldableSchedWrite sched> {
6161 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
6162 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
6163 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
6164 Sched<[sched]>, VEX;
6165 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
6166 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
6167 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
6168 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
// VTESTPS (0x0E) / VTESTPD (0x0F) instantiations, 128- and 256-bit.
6171 let Defs = [EFLAGS], Predicates = [HasAVX] in {
6172 let ExeDomain = SSEPackedSingle in {
6173 defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
6174 SchedWriteFTest.XMM>;
6175 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
6176 SchedWriteFTest.YMM>, VEX_L;
6178 let ExeDomain = SSEPackedDouble in {
6179 defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
6180 SchedWriteFTest.XMM>;
6181 defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
6182 SchedWriteFTest.YMM>, VEX_L;
6186 //===----------------------------------------------------------------------===//
6187 // SSE4.1 - Misc Instructions
6188 //===----------------------------------------------------------------------===//
// POPCNT r16/r32/r64 (opcode 0xB8 with XS prefix), selected from ISD ctpop.
// EFLAGS are clobbered ((implicit EFLAGS) alongside Defs = [EFLAGS]).
6190 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
6191 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
6192 "popcnt{w}\t{$src, $dst|$dst, $src}",
6193 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
6194 Sched<[WritePOPCNT]>, OpSize16, XS;
6195 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
6196 "popcnt{w}\t{$src, $dst|$dst, $src}",
6197 [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
6198 (implicit EFLAGS)]>,
6199 Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
6201 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
6202 "popcnt{l}\t{$src, $dst|$dst, $src}",
6203 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
6204 Sched<[WritePOPCNT]>, OpSize32, XS;
6206 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
6207 "popcnt{l}\t{$src, $dst|$dst, $src}",
6208 [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
6209 (implicit EFLAGS)]>,
6210 Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
// 64-bit forms use RI (REX.W).
6212 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
6213 "popcnt{q}\t{$src, $dst|$dst, $src}",
6214 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
6215 Sched<[WritePOPCNT]>, XS;
6216 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
6217 "popcnt{q}\t{$src, $dst|$dst, $src}",
6218 [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
6219 (implicit EFLAGS)]>,
6220 Sched<[WritePOPCNT.Folded]>, XS;
6223 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// rr and rm (folded-load) forms; used only for PHMINPOSUW below.
6224 multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
6225 SDNode OpNode, PatFrag ld_frag,
6226 X86FoldableSchedWrite Sched> {
6227 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6229 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6230 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
6232 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6234 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6236 (v8i16 (OpNode (ld_frag addr:$src))))]>,
6237 Sched<[Sched.Folded]>;
6240 // PHMIN has the same profile as PSAD, thus we use the same scheduling
6241 // model, although the naming is misleading.
6242 let Predicates = [HasAVX] in
6243 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
6245 WritePHMINPOS>, VEX, VEX_WIG;
6246 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
6250 /// SS48I_binop_rm - Simple SSE41 binary operator.
// rr form is commutable; rm form folds the load into the second operand.
// The asm string is selected by the (elided) Is2Addr flag: 2-operand
// legacy form vs. 3-operand VEX form.
6251 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6252 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6253 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6255 let isCommutable = 1 in
6256 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
6257 (ins RC:$src1, RC:$src2),
6259 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6260 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6261 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6263 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
6264 (ins RC:$src1, x86memop:$src2),
6266 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6267 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6269 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6270 Sched<[sched.Folded, sched.ReadAfterFold]>;
// AVX 128-bit integer min/max and PMULDQ. NoVLX / NoVLX_Or_NoBWI keep these
// from competing with the EVEX-encoded AVX-512VL forms.
6273 let Predicates = [HasAVX, NoVLX] in {
6274 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
6275 load, i128mem, SchedWriteVecALU.XMM, 0>,
6277 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
6278 load, i128mem, SchedWriteVecALU.XMM, 0>,
6280 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
6281 load, i128mem, SchedWriteVecALU.XMM, 0>,
6283 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
6284 load, i128mem, SchedWriteVecALU.XMM, 0>,
6286 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
6287 load, i128mem, SchedWriteVecIMul.XMM, 0>,
// Byte/word element variants additionally require no BWI.
6290 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
6291 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
6292 load, i128mem, SchedWriteVecALU.XMM, 0>,
6294 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
6295 load, i128mem, SchedWriteVecALU.XMM, 0>,
6297 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
6298 load, i128mem, SchedWriteVecALU.XMM, 0>,
6300 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
6301 load, i128mem, SchedWriteVecALU.XMM, 0>,
// AVX2 256-bit (Y suffix) variants of the same min/max/PMULDQ operators.
6305 let Predicates = [HasAVX2, NoVLX] in {
6306 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
6307 load, i256mem, SchedWriteVecALU.YMM, 0>,
6308 VEX_4V, VEX_L, VEX_WIG;
6309 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
6310 load, i256mem, SchedWriteVecALU.YMM, 0>,
6311 VEX_4V, VEX_L, VEX_WIG;
6312 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
6313 load, i256mem, SchedWriteVecALU.YMM, 0>,
6314 VEX_4V, VEX_L, VEX_WIG;
6315 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
6316 load, i256mem, SchedWriteVecALU.YMM, 0>,
6317 VEX_4V, VEX_L, VEX_WIG;
6318 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
6319 load, i256mem, SchedWriteVecIMul.YMM, 0>,
6320 VEX_4V, VEX_L, VEX_WIG;
// Byte/word element variants additionally require no BWI.
6322 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
6323 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
6324 load, i256mem, SchedWriteVecALU.YMM, 0>,
6325 VEX_4V, VEX_L, VEX_WIG;
6326 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
6327 load, i256mem, SchedWriteVecALU.YMM, 0>,
6328 VEX_4V, VEX_L, VEX_WIG;
6329 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
6330 load, i256mem, SchedWriteVecALU.YMM, 0>,
6331 VEX_4V, VEX_L, VEX_WIG;
6332 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
6333 load, i256mem, SchedWriteVecALU.YMM, 0>,
6334 VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE4.1 encodings: destructive two-operand form ($src1 = $dst),
// aligned-load memop fragments, Is2Addr = 1.
6337 let Constraints = "$src1 = $dst" in {
6338 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
6339 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6340 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
6341 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6342 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
6343 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6344 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
6345 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6346 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
6347 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6348 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
6349 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6350 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
6351 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6352 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
6353 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6354 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
6355 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
// PMULLD (32-bit low multiply, opcode 0x40) and PCMPEQQ (64-bit equality,
// opcode 0x29): AVX 128-bit, AVX2 256-bit, and legacy SSE4.1 forms.
// PMULLD gets its own SchedWritePMULLD class (slow on several uarches).
6358 let Predicates = [HasAVX, NoVLX] in
6359 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
6360 load, i128mem, SchedWritePMULLD.XMM, 0>,
6362 let Predicates = [HasAVX] in
6363 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
6364 load, i128mem, SchedWriteVecALU.XMM, 0>,
6367 let Predicates = [HasAVX2, NoVLX] in
6368 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
6369 load, i256mem, SchedWritePMULLD.YMM, 0>,
6370 VEX_4V, VEX_L, VEX_WIG;
6371 let Predicates = [HasAVX2] in
6372 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
6373 load, i256mem, SchedWriteVecALU.YMM, 0>,
6374 VEX_4V, VEX_L, VEX_WIG;
// Legacy destructive SSE4.1 encodings.
6376 let Constraints = "$src1 = $dst" in {
6377 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
6378 memop, i128mem, SchedWritePMULLD.XMM, 1>;
6379 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
6380 memop, i128mem, SchedWriteVecALU.XMM, 1>;
6383 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// Intrinsic-based variant: the pattern calls IntId directly rather than an
// ISD/X86ISD node. rri is commutable; rmi folds the load.
6384 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
6385 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
6386 X86MemOperand x86memop, bit Is2Addr,
6387 X86FoldableSchedWrite sched> {
6388 let isCommutable = 1 in
6389 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6390 (ins RC:$src1, RC:$src2, u8imm:$src3),
6392 !strconcat(OpcodeStr,
6393 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6394 !strconcat(OpcodeStr,
6395 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6396 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
6398 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6399 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6401 !strconcat(OpcodeStr,
6402 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6403 !strconcat(OpcodeStr,
6404 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6406 (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
6407 Sched<[sched.Folded, sched.ReadAfterFold]>;
6410 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
// SDNode-based sibling of SS41I_binop_rmi_int: same rri/rmi shape, but the
// pattern uses an OpNode over OpVT instead of a raw intrinsic.
6411 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6412 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6413 X86MemOperand x86memop, bit Is2Addr,
6414 X86FoldableSchedWrite sched> {
6415 let isCommutable = 1 in
6416 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6417 (ins RC:$src1, RC:$src2, u8imm:$src3),
6419 !strconcat(OpcodeStr,
6420 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6421 !strconcat(OpcodeStr,
6422 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6423 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
6425 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6426 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6428 !strconcat(OpcodeStr,
6429 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6430 !strconcat(OpcodeStr,
6431 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6433 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
6434 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Blend-immediate commutation transforms: swapping the two blend sources is
// equivalent to inverting the per-lane select mask, so each XForm masks the
// immediate to the lane count and XORs with the all-lanes mask
// (2, 4, or 8 lanes -> 0x03, 0x0f, 0xff).
6437 def BlendCommuteImm2 : SDNodeXForm<imm, [{
6438 uint8_t Imm = N->getZExtValue() & 0x03;
6439 return getI8Imm(Imm ^ 0x03, SDLoc(N));
6442 def BlendCommuteImm4 : SDNodeXForm<imm, [{
6443 uint8_t Imm = N->getZExtValue() & 0x0f;
6444 return getI8Imm(Imm ^ 0x0f, SDLoc(N));
6447 def BlendCommuteImm8 : SDNodeXForm<imm, [{
6448 uint8_t Imm = N->getZExtValue() & 0xff;
6449 return getI8Imm(Imm ^ 0xff, SDLoc(N));
// MPSADBW / DPPS / DPPD instantiations. MPSADBW is marked non-commutable
// (the immediate selects asymmetric source blocks). DPPS/DPPD pin the FP
// execution domain.
6452 let Predicates = [HasAVX] in {
6453 let isCommutable = 0 in {
6454 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6455 VR128, load, i128mem, 0,
6456 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6459 let ExeDomain = SSEPackedSingle in
6460 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6461 VR128, load, f128mem, 0,
6462 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6463 let ExeDomain = SSEPackedDouble in
6464 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6465 VR128, load, f128mem, 0,
6466 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
// 256-bit DPPS only; there is no 256-bit DPPD in AVX.
6467 let ExeDomain = SSEPackedSingle in
6468 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6469 VR256, load, i256mem, 0,
6470 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6473 let Predicates = [HasAVX2] in {
6474 let isCommutable = 0 in {
6475 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6476 VR256, load, i256mem, 0,
6477 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
// Legacy destructive SSE4.1 encodings (Is2Addr = 1, aligned memop).
6481 let Constraints = "$src1 = $dst" in {
6482 let isCommutable = 0 in {
6483 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6484 VR128, memop, i128mem, 1,
6485 SchedWriteMPSAD.XMM>;
6488 let ExeDomain = SSEPackedSingle in
6489 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6490 VR128, memop, f128mem, 1,
6491 SchedWriteDPPS.XMM>;
6492 let ExeDomain = SSEPackedDouble in
6493 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6494 VR128, memop, f128mem, 1,
6495 SchedWriteDPPD.XMM>;
6498 /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
// Like SS41I_binop_rmi, plus a commutation Pat: when the load feeds the
// FIRST source, swap operands and rewrite the immediate with commuteXForm
// (see BlendCommuteImm*) so the load can still be folded into rmi.
6499 multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6500 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6501 X86MemOperand x86memop, bit Is2Addr, Domain d,
6502 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6503 let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6504 let isCommutable = 1 in
6505 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6506 (ins RC:$src1, RC:$src2, u8imm:$src3),
6508 !strconcat(OpcodeStr,
6509 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6510 !strconcat(OpcodeStr,
6511 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6512 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
6514 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6515 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6517 !strconcat(OpcodeStr,
6518 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6519 !strconcat(OpcodeStr,
6520 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6522 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
6523 Sched<[sched.Folded, sched.ReadAfterFold]>;
6526 // Pattern to commute if load is in first source.
6527 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
6528 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6529 (commuteXForm imm:$src3))>;
// BLENDPS/BLENDPD/PBLENDW instantiations. The commute XForm's lane count
// matches OpVT's element count (v2->Imm2, v4->Imm4, v8/v16->Imm8).
6532 let Predicates = [HasAVX] in {
6533 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6534 VR128, load, f128mem, 0, SSEPackedSingle,
6535 SchedWriteFBlend.XMM, BlendCommuteImm4>,
6537 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6538 VR256, load, f256mem, 0, SSEPackedSingle,
6539 SchedWriteFBlend.YMM, BlendCommuteImm8>,
6540 VEX_4V, VEX_L, VEX_WIG;
6541 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6542 VR128, load, f128mem, 0, SSEPackedDouble,
6543 SchedWriteFBlend.XMM, BlendCommuteImm2>,
6545 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6546 VR256, load, f256mem, 0, SSEPackedDouble,
6547 SchedWriteFBlend.YMM, BlendCommuteImm4>,
6548 VEX_4V, VEX_L, VEX_WIG;
6549 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6550 VR128, load, i128mem, 0, SSEPackedInt,
6551 SchedWriteBlend.XMM, BlendCommuteImm8>,
// 256-bit PBLENDW requires AVX2. Note v16i16 still uses Imm8: the
// immediate controls 8 words per 128-bit lane.
6555 let Predicates = [HasAVX2] in {
6556 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6557 VR256, load, i256mem, 0, SSEPackedInt,
6558 SchedWriteBlend.YMM, BlendCommuteImm8>,
6559 VEX_4V, VEX_L, VEX_WIG;
// Legacy destructive SSE4.1 encodings.
6562 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6563 VR128, memop, f128mem, 1, SSEPackedSingle,
6564 SchedWriteFBlend.XMM, BlendCommuteImm4>;
6565 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6566 VR128, memop, f128mem, 1, SSEPackedDouble,
6567 SchedWriteFBlend.XMM, BlendCommuteImm2>;
6568 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6569 VR128, memop, i128mem, 1, SSEPackedInt,
6570 SchedWriteBlend.XMM, BlendCommuteImm8>;
6572 // For insertion into the zero index (low half) of a 256-bit vector, it is
6573 // more efficient to generate a blend with immediate instead of an insert*128.
// The 128-bit value is widened with IMPLICIT_DEF, then blended in: mask 0x3
// selects the low 2 of 4 doubles, 0xf the low 4 of 8 floats.
6574 let Predicates = [HasAVX] in {
6575 def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6576 (VBLENDPDYrri VR256:$src1,
6577 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6578 VR128:$src2, sub_xmm), 0x3)>;
6579 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6580 (VBLENDPSYrri VR256:$src1,
6581 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6582 VR128:$src2, sub_xmm), 0xf)>;
6585 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
// Variable blends (VBLENDVPS/PD, VPBLENDVB): the mask register $src3 is
// encoded in the immediate byte (Ii8Reg). The rm form's Sched list supplies
// ReadDefault entries for the memory-operand slots so the mask read is
// sequenced after the fold.
6586 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
6587 RegisterClass RC, X86MemOperand x86memop,
6588 PatFrag mem_frag, Intrinsic IntId,
6589 X86FoldableSchedWrite sched> {
6590 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6591 (ins RC:$src1, RC:$src2, RC:$src3),
6592 !strconcat(OpcodeStr,
6593 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6594 [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
6595 SSEPackedInt>, TAPD, VEX_4V,
6598 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6599 (ins RC:$src1, x86memop:$src2, RC:$src3),
6600 !strconcat(OpcodeStr,
6601 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6603 (IntId RC:$src1, (mem_frag addr:$src2),
6604 RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
6605 Sched<[sched.Folded, sched.ReadAfterFold,
6607 ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6610 sched.ReadAfterFold]>;
// VBLENDVPD/VBLENDVPS/VPBLENDVB instantiations: 128-bit under AVX,
// 256-bit VPBLENDVB only under AVX2.
6613 let Predicates = [HasAVX] in {
6614 let ExeDomain = SSEPackedDouble in {
6615 defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
6616 load, int_x86_sse41_blendvpd,
6617 SchedWriteFVarBlend.XMM>;
6618 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
6619 loadv4f64, int_x86_avx_blendv_pd_256,
6620 SchedWriteFVarBlend.YMM>, VEX_L;
6621 } // ExeDomain = SSEPackedDouble
6622 let ExeDomain = SSEPackedSingle in {
6623 defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
6624 load, int_x86_sse41_blendvps,
6625 SchedWriteFVarBlend.XMM>;
6626 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
6627 loadv8f32, int_x86_avx_blendv_ps_256,
6628 SchedWriteFVarBlend.YMM>, VEX_L;
6629 } // ExeDomain = SSEPackedSingle
6630 defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
6631 load, int_x86_sse41_pblendvb,
6632 SchedWriteVarBlend.XMM>;
6635 let Predicates = [HasAVX2] in {
6636 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
6637 load, int_x86_avx2_pblendvb,
6638 SchedWriteVarBlend.YMM>, VEX_L;
// Lower ISD vselect to variable blends. Operand order is swapped
// ($src2 first) because x86 blendv selects from the SECOND source where
// the mask bit is set, while vselect picks its first operand on true.
// Integer element types narrower/other than the FP width reuse
// VBLENDVPS/PD since only the mask's sign bits matter per lane.
6641 let Predicates = [HasAVX] in {
6642 def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
6643 (v16i8 VR128:$src2))),
6644 (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6645 def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6646 (v4i32 VR128:$src2))),
6647 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6648 def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
6649 (v4f32 VR128:$src2))),
6650 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6651 def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6652 (v2i64 VR128:$src2))),
6653 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6654 def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
6655 (v2f64 VR128:$src2))),
6656 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6657 def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6658 (v8i32 VR256:$src2))),
6659 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6660 def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
6661 (v8f32 VR256:$src2))),
6662 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6663 def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6664 (v4i64 VR256:$src2))),
6665 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6666 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
6667 (v4f64 VR256:$src2))),
6668 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
// 256-bit byte blend needs AVX2 (VPBLENDVBY).
6671 let Predicates = [HasAVX2] in {
6672 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
6673 (v32i8 VR256:$src2))),
6674 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6677 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6678 // changed to use blends because blends have better throughput on sandybridge
6679 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// OptForSpeed: lower vzmovl/Movss/Movsd to immediate blends. Immediates:
// BLENDPS imm 1 = take low float from second source; PBLENDW imm 3 = low
// two words (one dword); inverted masks (0xe, 2) when operands are swapped
// so the load stays foldable.
6680 let Predicates = [HasAVX, OptForSpeed] in {
6681 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6682 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6683 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6684 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6686 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6687 (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6688 def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6689 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6690 def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6691 (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6693 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6694 (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6695 def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6696 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6697 def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6698 (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6700 // Move low f32 and clear high bits.
// 256-bit vzmovl: blend the low 128 bits against zero, then widen with
// SUBREG_TO_REG (upper half is implicitly zeroed).
6701 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6702 (SUBREG_TO_REG (i32 0),
6703 (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6704 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6705 (i8 1))), sub_xmm)>;
6706 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6707 (SUBREG_TO_REG (i32 0),
6708 (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6709 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6710 (i8 3))), sub_xmm)>;
6712 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
6713 (SUBREG_TO_REG (i32 0),
6714 (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
6715 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
6716 (i8 1))), sub_xmm)>;
6717 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
6718 (SUBREG_TO_REG (i32 0),
6719 (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
6720 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
6721 (i8 0xf))), sub_xmm)>;
6724 // Prefer a movss or movsd over a blendps when optimizing for size. these were
6725 // changed to use blends because blends have better throughput on sandybridge
6726 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// Non-VEX SSE4.1 counterparts of the AVX OptForSpeed patterns above; memory
// forms use aligned memop* fragments as required by legacy encodings.
6727 let Predicates = [UseSSE41, OptForSpeed] in {
6728 // With SSE41 we can use blends for these patterns.
6729 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6730 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6731 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6732 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6734 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6735 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6736 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6737 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6738 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6739 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6741 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6742 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6743 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6744 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6745 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6746 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6750 /// SS41I_ternary_int - SSE 4.1 ternary operator
// Legacy (non-VEX) variable blends: the select mask is the IMPLICIT XMM0
// register (Uses = [XMM0]), and the encoding is destructive ($src1 = $dst).
// Hence the rr0/rm0 suffixes and the hard-coded %xmm0 in the asm strings.
6751 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6752 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
6753 X86MemOperand x86memop, Intrinsic IntId,
6754 X86FoldableSchedWrite sched> {
6755 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6756 (ins VR128:$src1, VR128:$src2),
6757 !strconcat(OpcodeStr,
6758 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6759 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
6762 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6763 (ins VR128:$src1, x86memop:$src2),
6764 !strconcat(OpcodeStr,
6765 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6768 (mem_frag addr:$src2), XMM0))]>,
6769 Sched<[sched.Folded, sched.ReadAfterFold]>;
// Instantiations: BLENDVPD (0x15), BLENDVPS (0x14), PBLENDVB (0x10).
6773 let ExeDomain = SSEPackedDouble in
6774 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
6775 int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
6776 let ExeDomain = SSEPackedSingle in
6777 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
6778 int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
6779 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
6780 int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
6782 // Aliases with the implicit xmm0 argument
// Assembler aliases letting users omit the implicit %xmm0 mask operand;
// the trailing 0 marks them as parse-only (not used for printing).
6783 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6784 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6785 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6786 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6787 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6788 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6789 def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6790 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6791 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6792 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6793 def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6794 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
// SSE4.1 vselect lowering when the mask is already in XMM0. Sources are
// swapped for the same reason as the AVX patterns: x86 blendv takes from
// the second source on a set mask bit.
6796 let Predicates = [UseSSE41] in {
6797 def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
6798 (v16i8 VR128:$src2))),
6799 (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
6800 def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
6801 (v4i32 VR128:$src2))),
6802 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6803 def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
6804 (v4f32 VR128:$src2))),
6805 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6806 def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
6807 (v2i64 VR128:$src2))),
6808 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6809 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
6810 (v2f64 VR128:$src2))),
6811 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
// (V)MOVNTDQA: aligned non-temporal vector loads. The instruction defs have
// empty patterns; selection happens through the alignednontemporalload
// patterns below, which cover every 128/256-bit element type since the
// instruction is type-agnostic. High AddedComplexity makes these win over
// ordinary load patterns.
6814 let AddedComplexity = 400 in { // Prefer non-temporal versions
6816 let Predicates = [HasAVX, NoVLX] in
6817 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6818 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6819 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6820 let Predicates = [HasAVX2, NoVLX] in
6821 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6822 "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6823 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6824 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6825 "movntdqa\t{$src, $dst|$dst, $src}", []>,
6826 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
// 256-bit non-temporal loads (AVX2).
6828 let Predicates = [HasAVX2, NoVLX] in {
6829 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6830 (VMOVNTDQAYrm addr:$src)>;
6831 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6832 (VMOVNTDQAYrm addr:$src)>;
6833 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6834 (VMOVNTDQAYrm addr:$src)>;
6835 def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6836 (VMOVNTDQAYrm addr:$src)>;
6837 def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6838 (VMOVNTDQAYrm addr:$src)>;
6839 def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6840 (VMOVNTDQAYrm addr:$src)>;
// 128-bit non-temporal loads, VEX encoding (AVX).
6843 let Predicates = [HasAVX, NoVLX] in {
6844 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6845 (VMOVNTDQArm addr:$src)>;
6846 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6847 (VMOVNTDQArm addr:$src)>;
6848 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6849 (VMOVNTDQArm addr:$src)>;
6850 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6851 (VMOVNTDQArm addr:$src)>;
6852 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6853 (VMOVNTDQArm addr:$src)>;
6854 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6855 (VMOVNTDQArm addr:$src)>;
// 128-bit non-temporal loads, legacy SSE4.1 encoding.
6858 let Predicates = [UseSSE41] in {
6859 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6860 (MOVNTDQArm addr:$src)>;
6861 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6862 (MOVNTDQArm addr:$src)>;
6863 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6864 (MOVNTDQArm addr:$src)>;
6865 def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6866 (MOVNTDQArm addr:$src)>;
6867 def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6868 (MOVNTDQArm addr:$src)>;
6869 def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6870 (MOVNTDQArm addr:$src)>;
6873 } // AddedComplexity
6875 //===----------------------------------------------------------------------===//
6876 // SSE4.2 - Compare Instructions
6877 //===----------------------------------------------------------------------===//
6879 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
// Same rr/rm shape as SS48I_binop_rm but with the SS428I base class.
// Note: no isCommutable here — PCMPGTQ, the only user, is not commutable.
6880 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6881 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6882 X86MemOperand x86memop, X86FoldableSchedWrite sched,
6884 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6885 (ins RC:$src1, RC:$src2),
6887 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6888 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6889 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6891 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6892 (ins RC:$src1, x86memop:$src2),
6894 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6895 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6897 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6898 Sched<[sched.Folded, sched.ReadAfterFold]>;
// PCMPGTQ (0x37) instantiations: AVX 128-bit, AVX2 256-bit, legacy SSE4.2.
6901 let Predicates = [HasAVX] in
6902 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6903 load, i128mem, SchedWriteVecALU.XMM, 0>,
6906 let Predicates = [HasAVX2] in
6907 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6908 load, i256mem, SchedWriteVecALU.YMM, 0>,
6909 VEX_4V, VEX_L, VEX_WIG;
6911 let Constraints = "$src1 = $dst" in
6912 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6913 memop, i128mem, SchedWriteVecALU.XMM>;
6915 //===----------------------------------------------------------------------===//
6916 // SSE4.2 - String/text Processing Instructions
6917 //===----------------------------------------------------------------------===//
// SSE4.2 packed-compare string instructions. The four families differ in
// opcode, in whether the result lands in XMM0 ("m" mask forms) or ECX
// ("i" index forms), and in whether explicit lengths are taken from
// EAX/EDX ("e" forms) or strings are implicit-length ("i" forms).
// All have empty patterns: they are selected manually, not via ISel.
6919 multiclass pcmpistrm_SS42AI<string asm> {
6920 def rr : SS42AI<0x62, MRMSrcReg, (outs),
6921 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6922 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6923 []>, Sched<[WritePCmpIStrM]>;
6925 def rm :SS42AI<0x62, MRMSrcMem, (outs),
6926 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6927 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6928 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
// PCMPISTRM: implicit-length, mask result — implicitly defines XMM0/EFLAGS.
6931 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6932 let Predicates = [HasAVX] in
6933 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6934 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
6937 multiclass SS42AI_pcmpestrm<string asm> {
6938 def rr : SS42AI<0x60, MRMSrcReg, (outs),
6939 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6940 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6941 []>, Sched<[WritePCmpEStrM]>;
6943 def rm : SS42AI<0x60, MRMSrcMem, (outs),
6944 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6945 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6946 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
// PCMPESTRM: explicit lengths read from EAX/EDX; mask result in XMM0.
6949 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6950 let Predicates = [HasAVX] in
6951 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6952 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
6955 multiclass SS42AI_pcmpistri<string asm> {
6956 def rr : SS42AI<0x63, MRMSrcReg, (outs),
6957 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6958 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6959 []>, Sched<[WritePCmpIStrI]>;
6961 def rm : SS42AI<0x63, MRMSrcMem, (outs),
6962 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6963 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6964 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
// PCMPISTRI: implicit-length, index result — implicitly defines ECX/EFLAGS.
6967 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6968 let Predicates = [HasAVX] in
6969 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6970 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
6973 multiclass SS42AI_pcmpestri<string asm> {
6974 def rr : SS42AI<0x61, MRMSrcReg, (outs),
6975 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6976 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6977 []>, Sched<[WritePCmpEStrI]>;
6979 def rm : SS42AI<0x61, MRMSrcMem, (outs),
6980 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6981 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6982 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
// PCMPESTRI: explicit lengths from EAX/EDX; index result in ECX.
6985 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6986 let Predicates = [HasAVX] in
6987 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6988 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
6991 //===----------------------------------------------------------------------===//
6992 // SSE4.2 - CRC Instructions
6993 //===----------------------------------------------------------------------===//
6995 // No CRC instructions have AVX equivalents
6997 // crc intrinsic instruction
6998 // This set of instructions are only rm, the only difference is the size
// CRC32 accumulate: register-source form. RCOut is both accumulator input
// ($src1, tied to $dst below) and result; RCIn is the data operand.
7000 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
7001 RegisterClass RCIn, SDPatternOperator Int> :
7002 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
7003 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
7004 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
7005 Sched<[WriteCRC32]>;
// CRC32 accumulate: memory-source form; data operand loaded from memory.
7007 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
7008 X86MemOperand x86memop, SDPatternOperator Int> :
7009 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
7010 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
7011 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
7012 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
// Instantiations for 8/16/32-bit data into a GR32 accumulator and 64-bit
// data into GR64 (REX.W). Opcode 0xF0 is the byte form, 0xF1 the others,
// disambiguated by operand-size prefixes.
7014 let Constraints = "$src1 = $dst" in {
7015 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
7016 int_x86_sse42_crc32_32_8>;
7017 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
7018 int_x86_sse42_crc32_32_8>;
7019 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
7020 int_x86_sse42_crc32_32_16>, OpSize16;
7021 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
7022 int_x86_sse42_crc32_32_16>, OpSize16;
7023 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
7024 int_x86_sse42_crc32_32_32>, OpSize32;
7025 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
7026 int_x86_sse42_crc32_32_32>, OpSize32;
7027 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
7028 int_x86_sse42_crc32_64_64>, REX_W;
7029 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
7030 int_x86_sse42_crc32_64_64>, REX_W;
// Byte source into a 64-bit accumulator; no intrinsic pattern attached.
7031 let hasSideEffects = 0 in {
7033 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
7035 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
7040 //===----------------------------------------------------------------------===//
7041 // SHA-NI Instructions
7042 //===----------------------------------------------------------------------===//
7044 // FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHA binary op: rr and rm forms. When UsesXMM0 is set the intrinsic takes
// XMM0 as an implicit third operand and the asm shows it explicitly.
7045 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
7046 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
7047 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
7048 (ins VR128:$src1, VR128:$src2),
7050 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
7051 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
7053 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
7054 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
7057 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
7058 (ins VR128:$src1, i128mem:$src2),
7060 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
7061 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
7063 (set VR128:$dst, (IntId VR128:$src1,
7064 (memop addr:$src2), XMM0)),
7065 (set VR128:$dst, (IntId VR128:$src1,
7066 (memop addr:$src2))))]>, T8,
7067 Sched<[sched.Folded, sched.ReadAfterFold]>;
// SHA1RNDS4 takes an extra 2-bit round-function immediate, so it is defined
// directly rather than via SHAI_binop. All SHA instructions are 2-address.
7070 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
7071 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
7072 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7073 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7075 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
7076 (i8 imm:$src3)))]>, TA,
7077 Sched<[SchedWriteVecIMul.XMM]>;
7078 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
7079 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7080 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7082 (int_x86_sha1rnds4 VR128:$src1,
7084 (i8 imm:$src3)))]>, TA,
7085 Sched<[SchedWriteVecIMul.XMM.Folded,
7086 SchedWriteVecIMul.XMM.ReadAfterFold]>;
7088 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
7089 SchedWriteVecIMul.XMM>;
7090 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
7091 SchedWriteVecIMul.XMM>;
7092 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
7093 SchedWriteVecIMul.XMM>;
// SHA256RNDS2 is the only user of the UsesXMM0=1 path (implicit XMM0 key).
7096 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
7097 SchedWriteVecIMul.XMM, 1>;
7099 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
7100 SchedWriteVecIMul.XMM>;
7101 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
7102 SchedWriteVecIMul.XMM>;
7105 // Aliases with explicit %xmm0
7106 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
7107 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
7108 def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
7109 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
7111 //===----------------------------------------------------------------------===//
7112 // AES-NI Instructions
7113 //===----------------------------------------------------------------------===//
// AES encrypt/decrypt round binary op. Is2Addr selects the 2-address SSE
// asm syntax vs. the 3-address VEX syntax; RC/MemOp default to the 128-bit
// forms and are overridden for the 256-bit VAES variants below.
7115 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
7116 Intrinsic IntId, PatFrag ld_frag,
7117 bit Is2Addr = 0, RegisterClass RC = VR128,
7118 X86MemOperand MemOp = i128mem> {
7119 let AsmString = OpcodeStr##
7120 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
7121 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
7122 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
7123 (ins RC:$src1, RC:$src2), "",
7124 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
7125 Sched<[WriteAESDecEnc]>;
7126 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
7127 (ins RC:$src1, MemOp:$src2), "",
7128 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
7129 Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
7133 // Perform One Round of an AES Encryption/Decryption Flow
// VEX 128-bit forms (AVX + AES, unless AVX-512VL+VAES covers them).
7134 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
7135 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
7136 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
7137 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
7138 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
7139 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
7140 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
7141 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
7142 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
// 256-bit VAES forms over VR256/i256mem.
7145 let Predicates = [NoVLX, HasVAES] in {
7146 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
7147 int_x86_aesni_aesenc_256, load, 0, VR256,
7148 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7149 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
7150 int_x86_aesni_aesenclast_256, load, 0, VR256,
7151 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7152 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
7153 int_x86_aesni_aesdec_256, load, 0, VR256,
7154 i256mem>, VEX_4V, VEX_L, VEX_WIG;
7155 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
7156 int_x86_aesni_aesdeclast_256, load, 0, VR256,
7157 i256mem>, VEX_4V, VEX_L, VEX_WIG;
// Legacy SSE 2-address forms (Is2Addr = 1, aligned memop).
7160 let Constraints = "$src1 = $dst" in {
7161 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
7162 int_x86_aesni_aesenc, memop, 1>;
7163 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
7164 int_x86_aesni_aesenclast, memop, 1>;
7165 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
7166 int_x86_aesni_aesdec, memop, 1>;
7167 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
7168 int_x86_aesni_aesdeclast, memop, 1>;
7171 // Perform the AES InvMixColumn Transformation
// AESIMC is unary (single source, no tied operand); VEX and legacy pairs.
7172 let Predicates = [HasAVX, HasAES] in {
7173 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7175 "vaesimc\t{$src1, $dst|$dst, $src1}",
7177 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
7179 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7180 (ins i128mem:$src1),
7181 "vaesimc\t{$src1, $dst|$dst, $src1}",
7182 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
7183 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
// Legacy SSE forms; note the memory form requires an aligned memop.
7185 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7187 "aesimc\t{$src1, $dst|$dst, $src1}",
7189 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
7190 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7191 (ins i128mem:$src1),
7192 "aesimc\t{$src1, $dst|$dst, $src1}",
7193 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
7194 Sched<[WriteAESIMC.Folded]>;
7196 // AES Round Key Generation Assist
// Unary with an 8-bit round-constant immediate; VEX and legacy pairs.
7197 let Predicates = [HasAVX, HasAES] in {
7198 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7199 (ins VR128:$src1, u8imm:$src2),
7200 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7202 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7203 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
7204 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7205 (ins i128mem:$src1, u8imm:$src2),
7206 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7208 (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
7209 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
7211 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7212 (ins VR128:$src1, u8imm:$src2),
7213 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7215 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7216 Sched<[WriteAESKeyGen]>;
7217 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7218 (ins i128mem:$src1, u8imm:$src2),
7219 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7221 (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
7222 Sched<[WriteAESKeyGen.Folded]>;
7224 //===----------------------------------------------------------------------===//
7225 // PCLMUL Instructions
7226 //===----------------------------------------------------------------------===//
7228 // Immediate transform to help with commuting.
// Swapping the two sources of PCLMULQDQ requires swapping the immediate's
// nibbles (bit 0 selects the qword of src1, bit 4 of src2).
7229 def PCLMULCommuteImm : SDNodeXForm<imm, [{
7230 uint8_t Imm = N->getZExtValue();
7231 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
7234 // SSE carry-less Multiplication instructions
7235 let Predicates = [NoAVX, HasPCLMUL] in {
7236 let Constraints = "$src1 = $dst" in {
7237 let isCommutable = 1 in
7238 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7239 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7240 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7242 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
7243 Sched<[WriteCLMul]>;
7245 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7246 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7247 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7249 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
7251 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
7252 } // Constraints = "$src1 = $dst"
// Fold a load in the FIRST operand by commuting and nibble-swapping the imm.
7254 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
7256 (PCLMULQDQrm VR128:$src1, addr:$src2,
7257 (PCLMULCommuteImm imm:$src3))>;
7258 } // Predicates = [NoAVX, HasPCLMUL]
// pclmul{hq,lq}{hq,lq}dq mnemonic aliases: encode the hq/lq choices into
// the immediate (bit 4 from LO, bit 0 from HI).
7261 foreach HI = ["hq","lq"] in
7262 foreach LO = ["hq","lq"] in {
7263 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
7264 (PCLMULQDQrr VR128:$dst, VR128:$src,
7265 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
7266 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
7267 (PCLMULQDQrm VR128:$dst, i128mem:$src,
7268 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
7271 // AVX carry-less Multiplication instructions
// 3-address VEX variant, parameterized over 128/256-bit register class,
// memory operand, load fragment, and intrinsic.
7272 multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
7273 PatFrag LdFrag, Intrinsic IntId> {
7274 let isCommutable = 1 in
7275 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
7276 (ins RC:$src1, RC:$src2, u8imm:$src3),
7277 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7279 (IntId RC:$src1, RC:$src2, imm:$src3))]>,
7280 Sched<[WriteCLMul]>;
7282 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
7283 (ins RC:$src1, MemOp:$src2, u8imm:$src3),
7284 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7286 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
7287 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
7289 // We can commute a load in the first operand by swapping the sources and
7290 // rotating the immediate.
7291 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
7292 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
7293 (PCLMULCommuteImm imm:$src3))>;
7296 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
7297 defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
7298 int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
7300 let Predicates = [NoVLX, HasVPCLMULQDQ] in
7301 defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
7302 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
// vpclmul{hq,lq}{hq,lq}dq aliases for the VEX forms; immediate encoding
// mirrors the legacy aliases above.
7304 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
7305 X86MemOperand MemOp, string Hi, string Lo> {
7306 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7307 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
7308 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7309 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7310 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
7311 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
7314 multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
7315 X86MemOperand MemOp> {
7316 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
7317 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
7318 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
7319 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
7323 defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
7324 defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
7326 //===----------------------------------------------------------------------===//
7327 // SSE4A Instructions
7328 //===----------------------------------------------------------------------===//
7330 let Predicates = [HasSSE4A] in {
7332 let ExeDomain = SSEPackedInt in {
// EXTRQ/INSERTQ bit-field extract/insert. The "I" variants take immediate
// length/index operands; the others take the control fields in an XMM mask.
7333 let Constraints = "$src = $dst" in {
7334 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
7335 (ins VR128:$src, u8imm:$len, u8imm:$idx),
7336 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7337 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
7339 PD, Sched<[SchedWriteVecALU.XMM]>;
7340 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7341 (ins VR128:$src, VR128:$mask),
7342 "extrq\t{$mask, $src|$src, $mask}",
7343 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
7345 PD, Sched<[SchedWriteVecALU.XMM]>;
7347 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
7348 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
7349 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7350 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7351 imm:$len, imm:$idx))]>,
7352 XD, Sched<[SchedWriteVecALU.XMM]>;
7353 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7354 (ins VR128:$src, VR128:$mask),
7355 "insertq\t{$mask, $src|$src, $mask}",
7356 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7358 XD, Sched<[SchedWriteVecALU.XMM]>;
7360 } // ExeDomain = SSEPackedInt
7362 // Non-temporal (unaligned) scalar stores.
7363 let AddedComplexity = 400 in { // Prefer non-temporal versions
7364 let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
7365 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7366 "movntss\t{$src, $dst|$dst, $src}", []>, XS;
7368 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7369 "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
// Scalar FP registers must be copied into VR128 before the store, since the
// instructions only accept XMM sources.
7372 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7373 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7375 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7376 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7378 } // AddedComplexity
7381 //===----------------------------------------------------------------------===//
7383 //===----------------------------------------------------------------------===//
7385 //===----------------------------------------------------------------------===//
7386 // VBROADCAST - Load from memory and broadcast to all elements of the
7387 // destination operand
// Memory-source broadcast: load one scalar and splat it into every element
// of the destination (AVX has only the memory form).
7389 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7390 X86MemOperand x86memop, ValueType VT,
7391 PatFrag ld_frag, SchedWrite Sched> :
7392 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7393 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7394 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
7395 Sched<[Sched]>, VEX;
7397 // AVX2 adds register forms
7398 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7399 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7400 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7401 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7402 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7403 Sched<[Sched]>, VEX;
// VBROADCASTSS (0x18) 128/256-bit and VBROADCASTSD (0x19) 256-bit,
// memory-source forms, gated on AVX without AVX-512VL.
7405 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7406 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7407 f32mem, v4f32, loadf32,
7408 SchedWriteFShuffle.XMM.Folded>;
7409 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7410 f32mem, v8f32, loadf32,
7411 SchedWriteFShuffle.XMM.Folded>, VEX_L;
7413 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7414 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7416 SchedWriteFShuffle.XMM.Folded>, VEX_L;
// AVX2 register-source forms of the same broadcasts.
7418 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7419 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7420 v4f32, v4f32, SchedWriteFShuffle.XMM>;
7421 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7422 v8f32, v4f32, WriteFShuffle256>, VEX_L;
7424 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7425 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7426 v4f64, v2f64, WriteFShuffle256>, VEX_L;
// Also match the broadcast when the scalar load is wrapped in
// scalar_to_vector before being splatted.
7428 let Predicates = [HasAVX, NoVLX] in {
7429 def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7430 (VBROADCASTSSrm addr:$src)>;
7431 def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
7432 (VBROADCASTSSYrm addr:$src)>;
7433 def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
7434 (VBROADCASTSDYrm addr:$src)>;
7437 //===----------------------------------------------------------------------===//
7438 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7439 // halves of a 256-bit vector.
// VBROADCASTI128/F128: load 128 bits and replicate into both halves of a
// YMM register. No ISel patterns on the defs themselves; selection happens
// through the X86SubVBroadcast patterns below.
7441 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7442 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7444 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7445 Sched<[WriteShuffleLd]>, VEX, VEX_L;
7447 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7448 ExeDomain = SSEPackedSingle in
7449 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7451 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7452 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
// Integer types use the AVX2 integer broadcast when available...
7454 let Predicates = [HasAVX2, NoVLX] in {
7455 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7456 (VBROADCASTI128 addr:$src)>;
7457 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7458 (VBROADCASTI128 addr:$src)>;
7459 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7460 (VBROADCASTI128 addr:$src)>;
7461 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7462 (VBROADCASTI128 addr:$src)>;
7465 let Predicates = [HasAVX, NoVLX] in {
7466 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7467 (VBROADCASTF128 addr:$src)>;
7468 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7469 (VBROADCASTF128 addr:$src)>;
// ...and fall back to the FP broadcast on AVX1-only targets.
7472 let Predicates = [HasAVX1Only] in {
7473 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7474 (VBROADCASTF128 addr:$src)>;
7475 def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7476 (VBROADCASTF128 addr:$src)>;
7477 def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7478 (VBROADCASTF128 addr:$src)>;
7479 def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7480 (VBROADCASTF128 addr:$src)>;
7483 //===----------------------------------------------------------------------===//
7484 // VINSERTF128 - Insert packed floating-point values
// VINSERTF128: insert a 128-bit value into the half of a YMM register
// selected by the immediate. Defs carry no patterns; lowering is done via
// vinsert_lowering below.
7486 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7487 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7488 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7489 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7490 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7492 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7493 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7494 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7495 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7498 // To create a 256-bit all ones value, we should produce VCMPTRUEPS
7499 // with YMM register containing zero.
7500 // FIXME: Avoid producing vxorps to clear the fake inputs.
7501 let Predicates = [HasAVX1Only] in {
7502 def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
// Map vinsert128_insert nodes (register and folded-load sources) onto the
// rr/rm forms, converting the insert index into the instruction immediate.
7505 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7506 PatFrag memop_frag> {
7507 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7509 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7510 (INSERT_get_vinsert128_imm VR256:$ins))>;
7511 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7512 (From (memop_frag addr:$src2)),
7514 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7515 (INSERT_get_vinsert128_imm VR256:$ins))>;
7518 let Predicates = [HasAVX, NoVLX] in {
7519 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7520 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
// On AVX1-only targets the FP insert also covers the integer types.
7523 let Predicates = [HasAVX1Only] in {
7524 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7525 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
7526 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7527 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
7530 //===----------------------------------------------------------------------===//
7531 // VEXTRACTF128 - Extract packed floating-point values
// VEXTRACTF128: extract the 128-bit half of a YMM register selected by the
// immediate, to a register (rr) or straight to memory (mr).
7533 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7534 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7535 (ins VR256:$src1, u8imm:$src2),
7536 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7537 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7539 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7540 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7541 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7542 []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
// Map vextract128_extract nodes (plain extract, and extract-then-store)
// onto rr/mr, converting the extract index into the immediate.
7545 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7546 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7547 (To (!cast<Instruction>(InstrStr#rr)
7549 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7550 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7551 (iPTR imm))), addr:$dst),
7552 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7553 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7557 let Predicates = [HasAVX, NoVLX] in {
7558 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7559 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
// FP extract also handles integer types on AVX1-only targets.
7562 let Predicates = [HasAVX1Only] in {
7563 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7564 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7565 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7566 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7569 //===----------------------------------------------------------------------===//
7570 // VMASKMOV - Conditional SIMD Packed Loads and Stores
// VMASKMOVPS/PD: masked load (rm/Yrm) and masked store (mr/Ymr) in 128-
// and 256-bit widths, selected via the given intrinsics. $src1 is the mask.
7572 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7573 Intrinsic IntLd, Intrinsic IntLd256,
7574 Intrinsic IntSt, Intrinsic IntSt256> {
7575 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7576 (ins VR128:$src1, f128mem:$src2),
7577 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7578 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7579 VEX_4V, Sched<[WriteFMaskedLoad]>;
7580 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7581 (ins VR256:$src1, f256mem:$src2),
7582 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7583 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7584 VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
7585 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7586 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7587 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7588 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7589 VEX_4V, Sched<[WriteFMaskedStore]>;
7590 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7591 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7592 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7593 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7594 VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
// Single-precision (0x2C load / 0x2E store) and double-precision
// (0x2D / 0x2F) instantiations.
7597 let ExeDomain = SSEPackedSingle in
7598 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7599 int_x86_avx_maskload_ps,
7600 int_x86_avx_maskload_ps_256,
7601 int_x86_avx_maskstore_ps,
7602 int_x86_avx_maskstore_ps_256>;
7603 let ExeDomain = SSEPackedDouble in
7604 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7605 int_x86_avx_maskload_pd,
7606 int_x86_avx_maskload_pd_256,
7607 int_x86_avx_maskstore_pd,
7608 int_x86_avx_maskstore_pd_256>;
7610 //===----------------------------------------------------------------------===//
7611 // VPERMIL - Permute Single and Double Floating-Point Values
// VPERMILPS/PD: in-lane permute. Variable forms (rr/rm, X86VPermilpv) take
// the control from an integer vector; immediate forms (ri/mi, X86VPermilpi)
// take it from an 8-bit immediate. Variable forms use varsched.
7614 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7615 RegisterClass RC, X86MemOperand x86memop_f,
7616 X86MemOperand x86memop_i,
7617 ValueType f_vt, ValueType i_vt,
7618 X86FoldableSchedWrite sched,
7619 X86FoldableSchedWrite varsched> {
7620 let Predicates = [HasAVX, NoVLX] in {
7621 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7622 (ins RC:$src1, RC:$src2),
7623 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7624 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7626 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7627 (ins RC:$src1, x86memop_i:$src2),
7628 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7629 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7630 (i_vt (load addr:$src2)))))]>, VEX_4V,
7631 Sched<[varsched.Folded, sched.ReadAfterFold]>;
7633 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7634 (ins RC:$src1, u8imm:$src2),
7635 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7636 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
7638 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7639 (ins x86memop_f:$src1, u8imm:$src2),
7640 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7642 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
7643 Sched<[sched.Folded]>;
7644 }// Predicates = [HasAVX, NoVLX]
// 128- and 256-bit instantiations for single (0x0C/0x04) and double
// (0x0D/0x05) precision.
7647 let ExeDomain = SSEPackedSingle in {
7648 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7649 v4f32, v4i32, SchedWriteFShuffle.XMM,
7650 SchedWriteFVarShuffle.XMM>;
7651 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7652 v8f32, v8i32, SchedWriteFShuffle.YMM,
7653 SchedWriteFVarShuffle.YMM>, VEX_L;
7655 let ExeDomain = SSEPackedDouble in {
7656 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7657 v2f64, v2i64, SchedWriteFShuffle.XMM,
7658 SchedWriteFVarShuffle.XMM>;
7659 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7660 v4f64, v4i64, SchedWriteFShuffle.YMM,
7661 SchedWriteFVarShuffle.YMM>, VEX_L;
7664 //===----------------------------------------------------------------------===//
7665 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
// Selects two 128-bit lanes (from either source) into the YMM destination,
// controlled by the 8-bit immediate. Marked commutable: swapping sources is
// legal if the immediate's source-select bits are adjusted (Perm2XCommuteImm).
7668 let ExeDomain = SSEPackedSingle in {
7669 let isCommutable = 1 in
7670 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7671 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7672 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7673 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7674 (i8 imm:$src3))))]>, VEX_4V, VEX_L,
7675 Sched<[WriteFShuffle256]>;
7676 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7677 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7678 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7679 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7680 (i8 imm:$src3)))]>, VEX_4V, VEX_L,
7681 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7684 // Immediate transform to help with commuting.
// XOR with 0x22 flips bit 1 in each 4-bit lane-selector field of the
// immediate, i.e. redirects each selected lane to the other source operand,
// so the two vector sources can be swapped without changing the result.
7685 def Perm2XCommuteImm : SDNodeXForm<imm, [{
7686 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7689 let Predicates = [HasAVX] in {
7690 // Pattern with load in other operand.
// Only the second source can be folded from memory; commute (with the
// immediate rewritten) when the load appears as the first source.
7691 def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7692 VR256:$src1, (i8 imm:$imm))),
7693 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
// Without AVX2 there is no VPERM2I128, so integer 128-bit permutes are
// lowered through the floating-point VPERM2F128 encoding.
7696 let Predicates = [HasAVX1Only] in {
7697 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
7698 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
7699 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7700 (loadv4i64 addr:$src2), (i8 imm:$imm))),
7701 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
7702 // Pattern with load in other operand.
7703 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7704 VR256:$src1, (i8 imm:$imm))),
7705 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
7708 //===----------------------------------------------------------------------===//
7709 // VZERO - Zero YMM registers
7710 // Note: These instructions do not affect the YMM16-YMM31 registers.
// Both instructions share opcode 0x77; VEX_L distinguishes VZEROALL
// (256-bit form) from VZEROUPPER (128-bit form). Only YMM0-YMM15 are
// listed in Defs, matching the note above.
7713 let SchedRW = [WriteSystem] in {
7714 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7715 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7716 // Zero All YMM registers
7717 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7718 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7719 Requires<[HasAVX]>, VEX_WIG;
7721 // Zero Upper bits of YMM registers
7722 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7723 [(int_x86_avx_vzeroupper)]>, PS, VEX,
7724 Requires<[HasAVX]>, VEX_WIG;
7728 //===----------------------------------------------------------------------===//
7729 // Half precision conversion instructions
// f16c_ph2ps: VCVTPH2PS - widen packed half-precision floats (in an XMM
// register or memory) to packed single precision in RC.
7732 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7733 X86FoldableSchedWrite sched> {
7734 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7735 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7736 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
7737 T8PD, VEX, Sched<[sched]>;
7738 let hasSideEffects = 0, mayLoad = 1 in
7739 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7740 "vcvtph2ps\t{$src, $dst|$dst, $src}",
7741 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
7742 T8PD, VEX, Sched<[sched.Folded]>;
// f16c_ps2ph: VCVTPS2PH - narrow packed single precision in RC to packed
// half precision; $src2 is the rounding-control immediate. The memory form
// has no pattern here (store patterns are supplied separately below).
7745 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7746 SchedWrite RR, SchedWrite MR> {
7747 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7748 (ins RC:$src1, i32u8imm:$src2),
7749 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7750 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
7751 TAPD, VEX, Sched<[RR]>;
7752 let hasSideEffects = 0, mayStore = 1 in
7753 def mr : Ii8<0x1D, MRMDestMem, (outs),
7754 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7755 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7756 TAPD, VEX, Sched<[MR]>;
7759 let Predicates = [HasF16C, NoVLX] in {
7760 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
7761 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
7762 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7764 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7765 WriteCvtPS2PHYSt>, VEX_L;
7767 // Pattern match vcvtph2ps of a scalar i64 load.
// The 128-bit form only reads the low 64 bits (4 halves), so a
// zero-extended/scalar i64 load can be folded into VCVTPH2PSrm directly.
7768 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
7769 (VCVTPH2PSrm addr:$src)>;
7770 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
7771 (VCVTPH2PSrm addr:$src)>;
7772 def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
7773 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
7774 (VCVTPH2PSrm addr:$src)>;
// Fold "convert then store the low 64 bits" into the memory form of
// VCVTPS2PH (the 128-bit destination holds 4 halves = one f64/i64).
7776 def : Pat<(store (f64 (extractelt
7777 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7778 (iPTR 0))), addr:$dst),
7779 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7780 def : Pat<(store (i64 (extractelt
7781 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
7782 (iPTR 0))), addr:$dst),
7783 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
7784 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
7785 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
7788 // Patterns for matching conversions from float to half-float and vice versa.
7789 let Predicates = [HasF16C, NoVLX] in {
7790 // Use MXCSR.RC for rounding instead of explicitly specifying the default
7791 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
7792 // configurations we support (the default). However, falling back to MXCSR is
7793 // more consistent with other instructions, which are always controlled by it.
7794 // It's encoded as 0b100.
// Scalar f32 -> f16: convert in an XMM register, then move the low 32 bits
// to a GPR and take the low 16 bits. The "4" immediate is MXCSR rounding.
7795 def : Pat<(fp_to_f16 FR32:$src),
7796 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
7797 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
// Scalar f16 -> f32: sign-extend the GR16 into a GR32, move into an XMM
// register, widen with VCVTPH2PS, and extract the scalar result.
7799 def : Pat<(f16_to_fp GR16:$src),
7800 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7801 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
// Round-trip f32 -> f16 -> f32 stays entirely in XMM registers.
7803 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
7804 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
7805 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
7808 //===----------------------------------------------------------------------===//
7809 // AVX2 Instructions
7810 //===----------------------------------------------------------------------===//
7812 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
// Emits rri (register) and rmi (memory-folded) blend forms plus a pattern
// that commutes the operands when the load is in the first source, using
// commuteXForm to rewrite the selection immediate accordingly.
7813 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7814 ValueType OpVT, X86FoldableSchedWrite sched,
7816 X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7817 let isCommutable = 1 in
7818 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7819 (ins RC:$src1, RC:$src2, u8imm:$src3),
7820 !strconcat(OpcodeStr,
7821 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7822 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
7823 Sched<[sched]>, VEX_4V;
7824 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7825 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7826 !strconcat(OpcodeStr,
7827 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7829 (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
7830 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7832 // Pattern to commute if load is in first source.
7833 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
7834 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7835 (commuteXForm imm:$src3))>;
// VPBLENDD: dword blend, 128-bit and 256-bit forms.
7838 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7839 SchedWriteBlend.XMM, VR128, i128mem,
7841 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7842 SchedWriteBlend.YMM, VR256, i256mem,
7843 BlendCommuteImm8>, VEX_L;
7845 // For insertion into the zero index (low half) of a 256-bit vector, it is
7846 // more efficient to generate a blend with immediate instead of an insert*128.
// Mask 0xf selects the low four dwords (the inserted 128-bit half) from the
// second operand; all integer element types are routed through VPBLENDDY.
7847 let Predicates = [HasAVX2] in {
7848 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7849 (VPBLENDDYrri VR256:$src1,
7850 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7851 VR128:$src2, sub_xmm), 0xf)>;
7852 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7853 (VPBLENDDYrri VR256:$src1,
7854 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7855 VR128:$src2, sub_xmm), 0xf)>;
7856 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7857 (VPBLENDDYrri VR256:$src1,
7858 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7859 VR128:$src2, sub_xmm), 0xf)>;
7860 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7861 (VPBLENDDYrri VR256:$src1,
7862 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7863 VR128:$src2, sub_xmm), 0xf)>;
// Without AVX2 the same insertions use the AVX1 float blend VBLENDPSY.
7866 let Predicates = [HasAVX1Only] in {
7867 def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7868 (VBLENDPSYrri VR256:$src1,
7869 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7870 VR128:$src2, sub_xmm), 0xf)>;
7871 def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7872 (VBLENDPSYrri VR256:$src1,
7873 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7874 VR128:$src2, sub_xmm), 0xf)>;
7875 def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7876 (VBLENDPSYrri VR256:$src1,
7877 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7878 VR128:$src2, sub_xmm), 0xf)>;
7879 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7880 (VBLENDPSYrri VR256:$src1,
7881 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7882 VR128:$src2, sub_xmm), 0xf)>;
7885 //===----------------------------------------------------------------------===//
7886 // VPBROADCAST - Load from memory and broadcast to all elements of the
7887 // destination operand
// avx2_broadcast emits four forms per element type: rr/rm broadcast into a
// 128-bit destination, Yrr/Yrm into a 256-bit destination; the source is
// either the low element of an XMM register or a scalar load (ld_frag).
7889 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7890 X86MemOperand x86memop, PatFrag ld_frag,
7891 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7892 let Predicates = [HasAVX2, prd] in {
7893 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7894 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7896 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7897 Sched<[SchedWriteShuffle.XMM]>, VEX;
7898 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7899 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7901 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
7902 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7903 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7904 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7906 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7907 Sched<[WriteShuffle256]>, VEX, VEX_L;
7908 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7909 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7911 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
// NOTE(review): Yrm uses the XMM folded sched class while Yrr uses
// WriteShuffle256 - confirm this asymmetry is intentional.
7912 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7914 // Provide aliases for broadcast from the same register class that
7915 // automatically does the extract.
7916 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7917 (!cast<Instruction>(NAME#"Yrr")
7918 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
// Byte/word forms are gated on NoVLX_Or_NoBWI; dword/qword on NoVLX
// (otherwise the AVX-512 variants take precedence).
7922 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
7923 v16i8, v32i8, NoVLX_Or_NoBWI>;
7924 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
7925 v8i16, v16i16, NoVLX_Or_NoBWI>;
7926 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
7927 v4i32, v8i32, NoVLX>;
7928 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
7929 v2i64, v4i64, NoVLX>;
7931 let Predicates = [HasAVX2, NoVLX] in {
7932 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
7933 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
7934 (VPBROADCASTQrm addr:$src)>;
7935 def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
7936 (VPBROADCASTQYrm addr:$src)>;
// Broadcast of a scalar load wrapped in scalar_to_vector folds to the
// memory form.
7938 def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
7939 (VPBROADCASTDrm addr:$src)>;
7940 def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
7941 (VPBROADCASTDYrm addr:$src)>;
7942 def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
7943 (VPBROADCASTQrm addr:$src)>;
7944 def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
7945 (VPBROADCASTQYrm addr:$src)>;
// Word broadcasts where the i16 arrives as a truncated i32 load (plain or
// zero-extending) fold to the VPBROADCASTW memory forms.
7947 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7948 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7949 // This means we'll encounter truncated i32 loads; match that here.
7950 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7951 (VPBROADCASTWrm addr:$src)>;
7952 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7953 (VPBROADCASTWYrm addr:$src)>;
7954 def : Pat<(v8i16 (X86VBroadcast
7955 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7956 (VPBROADCASTWrm addr:$src)>;
7957 def : Pat<(v16i16 (X86VBroadcast
7958 (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7959 (VPBROADCASTWYrm addr:$src)>;
// 256-bit float broadcast from a YMM source: extract the low XMM half and
// use the register form of VBROADCASTSS/SD.
7962 let Predicates = [HasAVX2, NoVLX] in {
7963 // Provide aliases for broadcast from the same register class that
7964 // automatically does the extract.
7965 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
7966 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
7968 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
7969 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
7973 let Predicates = [HasAVX2, NoVLX] in {
7974 // Provide fallback in case the load node that is used in the patterns above
7975 // is used by additional users, which prevents the pattern selection.
// Scalar FP value already in a register: reclass to VR128 and broadcast.
7976 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7977 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7978 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7979 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7980 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7981 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
// Broadcast from a GR8/GR16: widen the value into a full GR32 with
// INSERT_SUBREG (upper bits are don't-care), move to VR128, broadcast.
7984 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7985 def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7986 (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
7987 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7988 GR8:$src, sub_8bit)),
7990 def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7991 (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
7992 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7993 GR8:$src, sub_8bit)),
7996 def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7997 (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
7998 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7999 GR16:$src, sub_16bit)),
8001 def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
8002 (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
8003 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
8004 GR16:$src, sub_16bit)),
// GR32/GR64 broadcasts reclass the GPR into VR128 directly.
8007 let Predicates = [HasAVX2, NoVLX] in {
8008 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8009 (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
8010 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8011 (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
8012 def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
8013 (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
8014 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8015 (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
8018 // AVX1 broadcast patterns
// AVX1 has only the load forms of VBROADCASTSS/SD; integer element
// broadcasts from memory are routed through the FP encodings.
8019 let Predicates = [HasAVX1Only] in {
8020 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8021 (VBROADCASTSSYrm addr:$src)>;
8022 def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8023 (VBROADCASTSDYrm addr:$src)>;
8024 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8025 (VBROADCASTSSrm addr:$src)>;
8028 // Provide fallback in case the load node that is used in the patterns above
8029 // is used by additional users, which prevents the pattern selection.
8030 let Predicates = [HasAVX, NoVLX] in {
8031 // 128bit broadcasts:
// v2f64 broadcast = duplicate the low double, i.e. VMOVDDUP.
8032 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
8033 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
8034 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
8035 (VMOVDDUPrm addr:$src)>;
8037 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
8038 (VMOVDDUPrr VR128:$src)>;
8039 def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
8040 (VMOVDDUPrm addr:$src)>;
8041 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
8042 (VMOVDDUPrm addr:$src)>;
// Register-source broadcasts without AVX2: splat within a 128-bit lane
// using VPERMILPS/VPSHUFD/VMOVDDUP, then (for 256-bit results) duplicate
// the lane into the high half with VINSERTF128.
8045 let Predicates = [HasAVX1Only] in {
8046 def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8047 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
8048 def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8049 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
8050 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
8051 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
8052 def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8053 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
8054 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
8055 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
8057 def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8058 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
8059 def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8060 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
8061 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
8062 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
// 0x44 = shuffle mask {1,0,1,0} in dwords: duplicates the low qword.
8063 def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8064 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
8065 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
8066 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
8068 def : Pat<(v2i64 (X86VBroadcast i64:$src)),
8069 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
8070 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
8071 (VMOVDDUPrm addr:$src)>;
8074 //===----------------------------------------------------------------------===//
8075 // VPERM - Permute instructions
// avx2_perm: full-width variable permute (VPERMD/VPERMPS) - 256-bit only,
// control indices come from a second vector operand (X86VPermv).
8078 multiclass avx2_perm<bits<8> opc, string OpcodeStr,
8079 ValueType OpVT, X86FoldableSchedWrite Sched,
8080 X86MemOperand memOp> {
8081 let Predicates = [HasAVX2, NoVLX] in {
8082 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8083 (ins VR256:$src1, VR256:$src2),
8084 !strconcat(OpcodeStr,
8085 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8087 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
8088 Sched<[Sched]>, VEX_4V, VEX_L;
8089 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8090 (ins VR256:$src1, memOp:$src2),
8091 !strconcat(OpcodeStr,
8092 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8094 (OpVT (X86VPermv VR256:$src1,
8095 (load addr:$src2))))]>,
8096 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
8100 defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
8101 let ExeDomain = SSEPackedSingle in
8102 defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
// avx2_perm_imm: immediate-controlled qword permute (VPERMQ/VPERMPD) -
// X86VPermi with an 8-bit immediate; Ymi folds the vector source.
8104 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8105 ValueType OpVT, X86FoldableSchedWrite Sched,
8106 X86MemOperand memOp> {
8107 let Predicates = [HasAVX2, NoVLX] in {
8108 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
8109 (ins VR256:$src1, u8imm:$src2),
8110 !strconcat(OpcodeStr,
8111 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8113 (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
8114 Sched<[Sched]>, VEX, VEX_L;
8115 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
8116 (ins memOp:$src1, u8imm:$src2),
8117 !strconcat(OpcodeStr,
8118 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8120 (OpVT (X86VPermi (mem_frag addr:$src1),
8121 (i8 imm:$src2))))]>,
8122 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
8126 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
8127 WriteShuffle256, i256mem>, VEX_W;
8128 let ExeDomain = SSEPackedDouble in
8129 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
8130 WriteFShuffle256, f256mem>, VEX_W;
8132 //===----------------------------------------------------------------------===//
8133 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
// Integer counterpart of VPERM2F128 (AVX2). Commutable via the same
// Perm2XCommuteImm immediate rewrite defined earlier for VPERM2F128.
8135 let isCommutable = 1 in
8136 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
8137 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
8138 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8139 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8140 (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
8142 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
8143 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
8144 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8145 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
8147 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// Commute when the load is the first source (only src2 can be a memop).
8149 let Predicates = [HasAVX2] in
8150 def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
8151 VR256:$src1, (i8 imm:$imm))),
8152 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
8155 //===----------------------------------------------------------------------===//
8156 // VINSERTI128 - Insert packed integer values
// No ISel patterns on the defs themselves (empty []); selection is driven
// by the vinsert_lowering multiclass instantiations below.
8158 let hasSideEffects = 0 in {
8159 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
8160 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
8161 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8162 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
8164 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
8165 (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
8166 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8167 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
// Lower 128-into-256 subvector inserts for every integer element type.
8170 let Predicates = [HasAVX2, NoVLX] in {
8171 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
8172 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
8173 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
8174 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
8177 //===----------------------------------------------------------------------===//
8178 // VEXTRACTI128 - Extract packed integer values
// As with VINSERTI128, the defs carry no patterns; vextract_lowering
// supplies the subvector-extract selection patterns below.
8180 def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
8181 (ins VR256:$src1, u8imm:$src2),
8182 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8183 Sched<[WriteShuffle256]>, VEX, VEX_L;
8184 let hasSideEffects = 0, mayStore = 1 in
8185 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
8186 (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
8187 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8188 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
// Lower 128-from-256 subvector extracts for every integer element type.
8190 let Predicates = [HasAVX2, NoVLX] in {
8191 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
8192 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
8193 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
8194 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
8197 //===----------------------------------------------------------------------===//
8198 // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
// avx2_pmovmask: masked load (rm/Yrm, opcode 0x8c) and masked store
// (mr/Ymr, opcode 0x8e), selected through the target intrinsics.
8200 multiclass avx2_pmovmask<string OpcodeStr,
8201 Intrinsic IntLd128, Intrinsic IntLd256,
8202 Intrinsic IntSt128, Intrinsic IntSt256> {
8203 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
8204 (ins VR128:$src1, i128mem:$src2),
8205 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8206 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
8207 VEX_4V, Sched<[WriteVecMaskedLoad]>;
8208 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
8209 (ins VR256:$src1, i256mem:$src2),
8210 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8211 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8212 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
8213 def mr : AVX28I<0x8e, MRMDestMem, (outs),
8214 (ins i128mem:$dst, VR128:$src1, VR128:$src2),
8215 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8216 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
8217 VEX_4V, Sched<[WriteVecMaskedStore]>;
8218 def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
8219 (ins i256mem:$dst, VR256:$src1, VR256:$src2),
8220 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8221 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
8222 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
8225 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
8226 int_x86_avx2_maskload_d,
8227 int_x86_avx2_maskload_d_256,
8228 int_x86_avx2_maskstore_d,
8229 int_x86_avx2_maskstore_d_256>;
8230 defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
8231 int_x86_avx2_maskload_q,
8232 int_x86_avx2_maskload_q_256,
8233 int_x86_avx2_maskstore_q,
8234 int_x86_avx2_maskstore_q_256>, VEX_W;
// maskmov_lowering maps the generic masked load/store nodes onto a
// mask-mov instruction pair. A masked load whose passthru is undef or
// all-zeros selects the plain "rm" form (the hardware zeros the masked-off
// lanes); a non-trivial passthru needs a following variable blend.
8236 multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
8237 ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
8239 def: Pat<(X86mstore (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
8240 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
8242 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
8243 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
8244 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
8245 (VT (bitconvert (ZeroVT immAllZerosV))))),
8246 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
8247 def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
8248 (!cast<Instruction>(BlendStr#"rr")
8250 (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)),
8253 let Predicates = [HasAVX] in {
8254 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
8255 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
8256 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
8257 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
8259 let Predicates = [HasAVX1Only] in {
8260 // Masked load/store of i32/i64 elements is not supported; use the ps/pd
8261 // (VMASKMOVPS/PD) versions instead.
8261 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
8262 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
8263 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
8264 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
8266 let Predicates = [HasAVX2] in {
8267 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
8268 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
8269 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
8270 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
8273 //===----------------------------------------------------------------------===//
8274 // SubVector Broadcasts
8275 // Provide fallback in case the load node that is used in the patterns above
8276 // is used by additional users, which prevents the pattern selection.
// X86SubVBroadcast of a 128-bit register: place the source in the low half
// (INSERT_SUBREG into an IMPLICIT_DEF) and insert it again into the high
// half with VINSERTI128 (AVX2 integer) or VINSERTF128 (AVX float / AVX1).
8278 let Predicates = [HasAVX2, NoVLX] in {
8279 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
8280 (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8281 (v2i64 VR128:$src), 1)>;
8282 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
8283 (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8284 (v4i32 VR128:$src), 1)>;
8285 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
8286 (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8287 (v8i16 VR128:$src), 1)>;
8288 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
8289 (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8290 (v16i8 VR128:$src), 1)>;
8293 let Predicates = [HasAVX, NoVLX] in {
8294 def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
8295 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8296 (v2f64 VR128:$src), 1)>;
8297 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
8298 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8299 (v4f32 VR128:$src), 1)>;
// Without AVX2, integer subvector broadcasts fall back to VINSERTF128.
8302 let Predicates = [HasAVX1Only] in {
8303 def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
8304 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8305 (v2i64 VR128:$src), 1)>;
8306 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
8307 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8308 (v4i32 VR128:$src), 1)>;
8309 def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
8310 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8311 (v8i16 VR128:$src), 1)>;
8312 def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
8313 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
8314 (v16i8 VR128:$src), 1)>;
8317 //===----------------------------------------------------------------------===//
8318 // Variable Bit Shifts
// avx2_var_shift: per-element variable shifts (VPSLLV/VPSRLV/VPSRAV).
// OpNode is the generic shift node (shl/srl/sra); IntrinNode is the
// X86-specific node (X86vshlv/X86vsrlv/X86vsrav), matched by the extra
// patterns at the end so both forms select the same instructions.
8320 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
8321 SDNode IntrinNode, ValueType vt128, ValueType vt256> {
8322 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
8323 (ins VR128:$src1, VR128:$src2),
8324 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8326 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
8327 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
8328 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
8329 (ins VR128:$src1, i128mem:$src2),
8330 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8332 (vt128 (OpNode VR128:$src1,
8333 (vt128 (load addr:$src2)))))]>,
8334 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
8335 SchedWriteVarVecShift.XMM.ReadAfterFold]>;
8336 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8337 (ins VR256:$src1, VR256:$src2),
8338 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8340 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
8341 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
8342 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8343 (ins VR256:$src1, i256mem:$src2),
8344 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8346 (vt256 (OpNode VR256:$src1,
8347 (vt256 (load addr:$src2)))))]>,
8348 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
8349 SchedWriteVarVecShift.YMM.ReadAfterFold]>;
// Route the X86-specific shift node to the same four instruction forms.
8351 def : Pat<(vt128 (IntrinNode VR128:$src1, VR128:$src2)),
8352 (!cast<Instruction>(NAME#"rr") VR128:$src1, VR128:$src2)>;
8353 def : Pat<(vt128 (IntrinNode VR128:$src1, (load addr:$src2))),
8354 (!cast<Instruction>(NAME#"rm") VR128:$src1, addr:$src2)>;
8355 def : Pat<(vt256 (IntrinNode VR256:$src1, VR256:$src2)),
8356 (!cast<Instruction>(NAME#"Yrr") VR256:$src1, VR256:$src2)>;
8357 def : Pat<(vt256 (IntrinNode VR256:$src1, (load addr:$src2))),
8358 (!cast<Instruction>(NAME#"Yrm") VR256:$src1, addr:$src2)>;
// Note: there is no vpsravq in AVX2 (arithmetic qword shift is AVX-512),
// hence no VPSRAVQ instantiation here.
8361 let Predicates = [HasAVX2, NoVLX] in {
8362 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, X86vshlv, v4i32, v8i32>;
8363 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, X86vshlv, v2i64, v4i64>, VEX_W;
8364 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, X86vsrlv, v4i32, v8i32>;
8365 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, X86vsrlv, v2i64, v4i64>, VEX_W;
8366 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, X86vsrav, v4i32, v8i32>;
8369 //===----------------------------------------------------------------------===//
8370 // VGATHER - GATHER Operations
8372 // FIXME: Improve scheduling of gather instructions.
// avx2_gather emits the 128-bit (rm) and wider (Yrm) gather forms. Each
// produces TWO results: the gathered data ($dst) and the written-back mask
// ($mask_wb) - gathers clear mask elements as loads complete. MTx/MTy
// default to the data types and let FP gathers use integer mask types.
8373 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
8374 ValueType VTy, PatFrag GatherNode128,
8375 PatFrag GatherNode256, RegisterClass RC256,
8376 X86MemOperand memop128, X86MemOperand memop256,
8377 ValueType MTx = VTx, ValueType MTy = VTy> {
8378 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
8379 (ins VR128:$src1, memop128:$src2, VR128:$mask),
8380 !strconcat(OpcodeStr,
8381 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8382 [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
8383 (GatherNode128 VR128:$src1, VR128:$mask,
8384 vectoraddr:$src2))]>,
8385 VEX, Sched<[WriteLoad]>;
8386 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
8387 (ins RC256:$src1, memop256:$src2, RC256:$mask),
8388 !strconcat(OpcodeStr,
8389 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8390 [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
8391 (GatherNode256 RC256:$src1, RC256:$mask,
8392 vectoraddr:$src2))]>,
8393 VEX, VEX_L, Sched<[WriteLoad]>;
// AVX2 gather instantiations. Shared constraints for every form:
//  - both outputs are @earlyclobber: the hardware forbids $dst / $mask_wb
//    from aliasing the index or mask sources;
//  - $src1 is tied to $dst and $mask to $mask_wb, modeling the
//    read-modify-write semantics (unmasked lanes keep their old value and
//    the mask register is cleared as elements are gathered — TODO confirm
//    exact mask-writeback semantics against the Intel SDM).
// mayLoad/hasSideEffects are set explicitly because the patterns live in
// the multiclass and would otherwise infer flags per-def.
8396 let Predicates = [UseAVX2] in {
8397   let mayLoad = 1, hasSideEffects = 0, Constraints
8398       = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  // Integer gathers. Naming: <index width><data width>, e.g. DQ = dword
  // indices gathering qword data.
8400     defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
8401                                   mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
8402     defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
8403                                   mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
8404     defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
8405                                   mgatherv8i32, VR256, vx128mem, vy256mem>;
  // QD gathers only 4 x i32 even in the wide form, hence VR128 result.
8406     defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
8407                                   mgatherv4i64, VR128, vx64mem, vy128mem>;
  // FP gathers: data types are floating point, but the mask types (the
  // trailing v2i64/v4i64 arguments) stay integer.
8409   let ExeDomain = SSEPackedDouble in {
8410     defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
8411                                   mgatherv4i32, VR256, vx128mem, vx256mem,
8412                                   v2i64, v4i64>, VEX_W;
8413     defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
8414                                   mgatherv4i64, VR256, vx128mem, vy256mem,
8415                                   v2i64, v4i64>, VEX_W;
8418   let ExeDomain = SSEPackedSingle in {
8419     defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
8420                                   mgatherv8i32, VR256, vx128mem, vy256mem,
8422     defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
8423                                   mgatherv4i64, VR128, vx64mem, vy128mem,
8429 //===----------------------------------------------------------------------===//
8430 // GFNI instructions
8431 //===----------------------------------------------------------------------===//
/// GF2P8MULB_rm - Galois-field byte multiply (GF2P8MULB), reg/reg and
/// reg/mem forms. Is2Addr selects between the 2-operand (legacy SSE,
/// destructive) asm string and the 3-operand (VEX) asm string; the per-def
/// asm is left empty ("") because AsmString is set on the enclosing let.
/// The reg/reg form is commutable, matching the commutativity of the
/// X86GF2P8mulb node.
8433 multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
8434                         RegisterClass RC, PatFrag MemOpFrag,
8435                         X86MemOperand X86MemOp, bit Is2Addr = 0> {
8436   let ExeDomain = SSEPackedInt,
8437       AsmString = !if(Is2Addr,
8438         OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
8439         OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
8440     let isCommutable = 1 in
8441     def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
8442                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
8443              Sched<[SchedWriteVecALU.XMM]>, T8PD;
    // Memory form: second operand folded via MemOpFrag (memop for SSE,
    // load for the VEX variants — see the instantiations below).
8445     def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
8446                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
8447                                  (MemOpFrag addr:$src2))))]>,
8448              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
/// GF2P8AFFINE_rmi - Galois-field affine transform instructions
/// (GF2P8AFFINEQB / GF2P8AFFINEINVQB): reg/reg/imm and reg/mem/imm forms.
/// Operands are two vector sources plus an 8-bit immediate ($src3).
/// As with GF2P8MULB_rm above, Is2Addr switches the SSE vs. VEX asm string
/// and the per-def asm string is intentionally empty.
8452 multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
8453                            SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
8454                            X86MemOperand X86MemOp, bit Is2Addr = 0> {
8455   let AsmString = !if(Is2Addr,
8456       OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
8457       OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
8458   def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
8459                 (ins RC:$src1, RC:$src2, u8imm:$src3), "",
8460                 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
8461                 SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
  // Memory form: $src2 comes from memory through MemOpFrag.
8462   def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
8463                 (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
8464                 [(set RC:$dst, (OpVT (OpNode RC:$src1,
8465                                       (MemOpFrag addr:$src2),
8466                                 imm:$src3)))], SSEPackedInt>,
8467                 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
/// GF2P8AFFINE_common - Instantiates one GF2P8 affine opcode in all three
/// encodings: legacy SSE (2-operand, $src1 tied to $dst, 128-bit only) and
/// VEX 128/256-bit 3-operand forms. All VEX forms carry VEX_W. The AVX
/// variants are guarded by NoVLX_Or_NoBWI — presumably deferring to EVEX
/// encodings when AVX-512VL+BW is available; TODO confirm against the
/// AVX-512 defs.
8471 multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
8472   let Constraints = "$src1 = $dst",
8473       Predicates  = [HasGFNI, UseSSE2] in
8474   defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
8475                                       VR128, load, i128mem, 1>;
8476   let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
8477     defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
8478                                       load, i128mem>, VEX_4V, VEX_W;
8479     defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
8480                                       load, i256mem>, VEX_4V, VEX_L, VEX_W;
// GF2P8MULB: legacy SSE form (destructive, memop-folded) plus VEX 128/256
// forms (non-destructive, plain load fold).
8485 let Constraints = "$src1 = $dst",
8486     Predicates  = [HasGFNI, UseSSE2] in
8487 defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
8489 let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
8490   defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
8492   defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
8493                                    i256mem>, VEX_4V, VEX_L;
8495 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
// The affine transforms take an immediate and are not commutable, unlike
// GF2P8MULB above.
8496 let isCommutable = 0 in {
8497   defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
8498                                              X86GF2P8affineinvqb>, TAPD;
8499   defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
8500                                              X86GF2P8affineqb>, TAPD;