//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  // InstrSchedModel info.
  X86FoldableSchedWrite Sched = WriteFAdd;
}

class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}

class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
                     InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
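
// A rough sketch of how these bundles are consumed (illustrative, not a def
// from this file): an OpndItins pairs the register-register and
// register-memory itinerary classes for one operation width, e.g.
//   def EX_ITINS : OpndItins<IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM>;
// and a SizeItins groups an f32 bundle with its f64 counterpart so that
// multiclasses can pick itins.s or itins.d by operand size.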

let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;

let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;

let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;

let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;

let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;

let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;

def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;

def SSE_DPPS_ITINS : OpndItins<
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;

def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;

let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

let Sched = WriteVecIMul in
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;

// Definitions for backward compatibility.
// The instructions mapped onto these definitions use a different itinerary
// than the actual scheduling model.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVarBlend in
def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteBlend in
def SSE_INTALU_ITINS_BLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
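
// Note on the "let Sched = ... in" wrappers above: they override the Sched
// field inherited from OpndItins (which defaults to WriteFAdd), so a def
// such as this sketch
//   let Sched = WriteFBlend in
//   def EX_BLEND_ITINS : OpndItins<IIC_ALU_NONMEM, IIC_ALU_MEM>;
// makes instructions built from EX_BLEND_ITINS schedule against WriteFBlend
// under the newer machine-scheduler models.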

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
     Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
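
// For illustration, a hypothetical instantiation (not a defm from this file)
//   defm EXADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
//                                SSEPackedSingle, SSE_ALU_F32S>;
// would expand to EXADDrr and EXADDrm, using the two-operand asm string
// since Is2Addr defaults to 1.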

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator Int, RegisterClass RC,
                               string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
       Sched<[itins.Sched]>;

  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
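
// The _Int forms exist so that intrinsic calls, which operate on whole
// 128-bit vectors rather than FR32/FR64 scalars, can reuse the same
// encodings; isCodeGenOnly keeps these variants out of the assembler and
// disassembler tables.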

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;

  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
     itins.rm, d>,
     Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     pat_rm, NoItinerary, d>,
     Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
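
// Note: the rr/rm dag patterns are supplied by the caller here rather than
// built from an OpNode, since the packed FP logical ops want per-type
// patterns (typically routed through integer-domain bitcasts) that this
// multiclass can't express generically.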

//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy.
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
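
// These promotions need no instruction: FR32/FR64 values already live in
// the low lane of the same physical XMM registers that VR128 uses, so a
// register-class copy suffices.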

// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion.
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
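
// Note that every ordered pair of 128-bit vector types is spelled out above;
// TableGen has no wildcard for "bitcast between any two legal types", so the
// full cross product is enumerated by hand (as it is for the 256-bit types
// below).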

// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion.
def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;

// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoVLX] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;

// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
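
// Rough sketch of the post-RA expansion (see X86InstrInfo's
// expandPostRAPseudo): V_SET0 becomes xorps (or vxorps under AVX) of a
// register with itself, and V_SETALLONES becomes pcmpeqd of a register with
// itself; the zeroing idiom in particular is dependency-breaking on most
// cores.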

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d = GenericDomain> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                            (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d = GenericDomain> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
                   Sched<[WriteStore]>;
}

// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr,
                         Domain d = GenericDomain> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (mem_pat addr:$src))],
                   IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
}
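
// The MOVSS/MOVSD definitions below instantiate both multiclasses: sse12_move
// provides the register-move and store forms (SSE and VEX-encoded), while
// sse12_move_rm provides the loads that implicitly zero the upper elements.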

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
                             SSEPackedSingle>, XS;

  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
                               SSEPackedDouble>, XD;
}

let Predicates = [UseAVX] in {
  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

let Predicates = [UseSSE1] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}

let Predicates = [UseSSE2] in {
  let Predicates = [NoSSE41], AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
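
// Why the swap helps: the 2-byte VEX prefix only carries VEX.R, so a high
// register (xmm8-xmm15) in the r/m field would force VEX.B and a 3-byte
// prefix; the swapped-operand _REV encoding moves that register into
// modrm.reg, where VEX.R still reaches it.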

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins> {
let hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
              Sched<[WriteFShuffle]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
              Sched<[WriteLoad]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                                PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                                PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                                PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
                                PD, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                                 "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                                 PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                                 "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                                 PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                                 "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                                 PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                                 "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
                                 PD, VEX, VEX_L;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                               "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
                               PD;
}

let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)],
                      IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}

// Aliases to help the assembler pick two byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}

// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of the lower 128 bits.
  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}

let Predicates = [HasAVX, NoVLX] in {
  // 128-bit load/store
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [(set VR128:$dst,
                  (psnode VR128:$src1,
                          (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
                itin, SSEPackedSingle>, PS,
                Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                          (scalar_to_vector (loadf64 addr:$src2)))))],
                itin, SSEPackedDouble>, PD,
                Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
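
// A single caller thus yields both the "s" (packed-single) and "d"
// (packed-double) flavors: base_opc is spliced with the suffix above, so
// "movlp" expands to movlps/movlpd and "movhp" to movhps/movhpd.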

multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}

let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}

let SchedRW = [WriteStore] in {
let Predicates = [UseAVX] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                   (iPTR 0))), addr:$dst)],
                     IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)],
                     IIC_SSE_MOV_LH>, VEX;
}
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                   IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                   IIC_SSE_MOV_LH>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
                         (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                       (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                           (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                   addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}

let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                              (bc_v2f64 (v4f32 VR128:$src))),
                                   (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
}
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW

let Predicates = [UseAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // VMOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // Also handle an i64 load because that may get selected as a faster way to
  // load f64.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // Also handle an i64 load because that may get selected as a faster way to
  // load f64.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                        VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                        VEX_4V, Sched<[WriteFShuffle]>;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}

let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details).
1430 multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1431 SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
1432 string asm, OpndItins itins> {
1433 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
1434 [(set DstRC:$dst, (OpNode SrcRC:$src))],
1435 itins.rr>, Sched<[itins.Sched]>;
1436 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
1437 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
itins.rm>, Sched<[itins.Sched.Folded]>;
}
1441 multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
1442 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
1443 string asm, Domain d, OpndItins itins> {
1444 let hasSideEffects = 0 in {
1445 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
1446 [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
1447 itins.rr, d>, Sched<[itins.Sched]>;
1449 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
1450 [(set RC:$dst, (DstTy (sint_to_fp
1451 (SrcTy (bitconvert (ld_frag addr:$src))))))],
itins.rm, d>, Sched<[itins.Sched.Folded]>;
} // hasSideEffects = 0
}
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details).
1458 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1459 X86MemOperand x86memop, string asm> {
1460 let hasSideEffects = 0, Predicates = [UseAVX] in {
1461 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
1462 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
1463 Sched<[WriteCvtI2F]>;
1465 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1466 (ins DstRC:$src1, x86memop:$src),
1467 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
1468 Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // hasSideEffects = 0
}
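
// Note on the FIXME above: even with the load folded, these instructions
// merge into a destination register that the patterns below materialize from
// an IMPLICIT_DEF, so the rm form carries a false dependence on whatever
// value that register last held. A separate load plus the rr form breaks the
// dependence at a small code-size cost, which is the trade-off documented in
// sse_fp_unop_s.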
1472 let Predicates = [UseAVX] in {
1473 defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
1474 "cvttss2si\t{$src, $dst|$dst, $src}",
1477 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
1478 "cvttss2si\t{$src, $dst|$dst, $src}",
1480 XS, VEX, VEX_W, VEX_LIG;
1481 defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
1482 "cvttsd2si\t{$src, $dst|$dst, $src}",
1485 defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
1486 "cvttsd2si\t{$src, $dst|$dst, $src}",
1488 XD, VEX, VEX_W, VEX_LIG;
1490 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1491 (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
1492 def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1493 (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
1494 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1495 (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
1496 def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1497 (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
1498 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1499 (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
1500 def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1501 (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
1502 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1503 (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
1504 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1505 (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used, so
// provide explicit "l" and "q" assembly forms to disambiguate the operand
// size where appropriate.
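// For example, "vcvtsi2ss (%rax), %xmm0, %xmm0" leaves the width of the
// memory source ambiguous, while "vcvtsi2ssl (%rax), %xmm0, %xmm0" and
// "vcvtsi2ssq (%rax), %xmm0, %xmm0" pin it to 32 or 64 bits.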
1511 defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
1512 XS, VEX_4V, VEX_LIG;
1513 defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
1514 XS, VEX_4V, VEX_W, VEX_LIG;
1515 defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
1516 XD, VEX_4V, VEX_LIG;
1517 defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
XD, VEX_4V, VEX_W, VEX_LIG;
}
1520 let Predicates = [UseAVX] in {
1521 def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1522 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
1523 def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1524 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
1526 def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
1527 (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
1528 def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
1529 (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
1530 def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
1531 (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
1532 def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
1533 (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
1535 def : Pat<(f32 (sint_to_fp GR32:$src)),
1536 (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
1537 def : Pat<(f32 (sint_to_fp GR64:$src)),
1538 (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
1539 def : Pat<(f64 (sint_to_fp GR32:$src)),
1540 (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
1541 def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1545 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
1546 "cvttss2si\t{$src, $dst|$dst, $src}",
1547 SSE_CVT_SS2SI_32>, XS;
1548 defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
1549 "cvttss2si\t{$src, $dst|$dst, $src}",
1550 SSE_CVT_SS2SI_64>, XS, REX_W;
1551 defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
1552 "cvttsd2si\t{$src, $dst|$dst, $src}",
1554 defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
1555 "cvttsd2si\t{$src, $dst|$dst, $src}",
1556 SSE_CVT_SD2SI>, XD, REX_W;
1557 defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
1558 "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1559 SSE_CVT_Scalar>, XS;
1560 defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
1561 "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1562 SSE_CVT_Scalar>, XS, REX_W;
1563 defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
1564 "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1565 SSE_CVT_Scalar>, XD;
1566 defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
1567 "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1568 SSE_CVT_Scalar>, XD, REX_W;
1570 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1571 (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
1572 def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1573 (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
1574 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1575 (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
1576 def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1577 (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
1578 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1579 (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
1580 def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1581 (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
1582 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1583 (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
1584 def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1585 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
1587 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1588 (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
1589 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1590 (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
1592 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
1593 // and/or XMM operand(s).
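// For instance, int_x86_sse2_cvtsd2si takes a full <2 x double> vector
// operand rather than a scalar f64, which is why the defs below read from
// VR128 where the non-intrinsic forms above use FR64.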
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details).
1597 multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1598 Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
1599 string asm, OpndItins itins> {
1600 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1601 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1602 [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
1603 Sched<[itins.Sched]>;
1604 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1605 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1606 [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
1607 Sched<[itins.Sched.Folded]>;
1610 multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1611 RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, OpndItins itins,
bit Is2Addr = 1> {
1614 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
    !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1617 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1618 [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
1619 itins.rr>, Sched<[itins.Sched]>;
1620 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1621 (ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
    !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1624 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1625 [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
1626 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
1629 let Predicates = [UseAVX] in {
1630 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
1631 int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
1632 SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
1633 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
1634 int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
1635 SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
1637 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
1638 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
1639 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
1640 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1643 let isCodeGenOnly = 1 in {
1644 let Predicates = [UseAVX] in {
1645 defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1646 int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
1647 SSE_CVT_Scalar, 0>, XS, VEX_4V;
1648 defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1649 int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
SSE_CVT_Scalar, 0>, XS, VEX_4V, VEX_W;
1652 defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1653 int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
1654 SSE_CVT_Scalar, 0>, XD, VEX_4V;
1655 defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1656 int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
SSE_CVT_Scalar, 0>, XD, VEX_4V, VEX_W;
}
1660 let Constraints = "$src1 = $dst" in {
1661 defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1662 int_x86_sse_cvtsi2ss, i32mem, loadi32,
1663 "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
1664 defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1665 int_x86_sse_cvtsi642ss, i64mem, loadi64,
1666 "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
1667 defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1668 int_x86_sse2_cvtsi2sd, i32mem, loadi32,
1669 "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
1670 defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1671 int_x86_sse2_cvtsi642sd, i64mem, loadi64,
1672 "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
1674 } // isCodeGenOnly = 1
1678 // Aliases for intrinsics
1679 let isCodeGenOnly = 1 in {
1680 let Predicates = [UseAVX] in {
1681 defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
1682 ssmem, sse_load_f32, "cvttss2si",
1683 SSE_CVT_SS2SI_32>, XS, VEX;
1684 defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1685 int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
1686 "cvttss2si", SSE_CVT_SS2SI_64>,
1688 defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
1689 sdmem, sse_load_f64, "cvttsd2si",
1690 SSE_CVT_SD2SI>, XD, VEX;
1691 defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1692 int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
1693 "cvttsd2si", SSE_CVT_SD2SI>,
1696 defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
1697 ssmem, sse_load_f32, "cvttss2si",
1698 SSE_CVT_SS2SI_32>, XS;
1699 defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1700 int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
1701 "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
1702 defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
SSE_CVT_SD2SI>, XD;
1705 defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1706 int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
1707 "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1708 } // isCodeGenOnly = 1
1710 let Predicates = [UseAVX] in {
1711 defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
1712 ssmem, sse_load_f32, "cvtss2si",
1713 SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
1714 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
1715 ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
1718 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
1719 ssmem, sse_load_f32, "cvtss2si",
1720 SSE_CVT_SS2SI_32>, XS;
1721 defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
1722 ssmem, sse_load_f32, "cvtss2si",
1723 SSE_CVT_SS2SI_64>, XS, REX_W;
1725 defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
1726 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1727 SSEPackedSingle, SSE_CVT_PS>,
1728 PS, VEX, Requires<[HasAVX, NoVLX]>;
1729 defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
1730 "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1731 SSEPackedSingle, SSE_CVT_PS>,
1732 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
1734 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
1735 "cvtdq2ps\t{$src, $dst|$dst, $src}",
1736 SSEPackedSingle, SSE_CVT_PS>,
1737 PS, Requires<[UseSSE2]>;
1739 let Predicates = [UseAVX] in {
1740 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1741 (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
1742 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1743 (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
1744 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1745 (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
1746 def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1747 (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
1748 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1749 (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
1750 def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1751 (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
1752 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1753 (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
1754 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}
1758 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1759 (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
1760 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1761 (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
1762 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1763 (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
1764 def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1765 (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
1766 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1767 (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
1768 def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1769 (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
1770 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1771 (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
1772 def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1773 (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1777 // Convert scalar double to scalar single
1778 let hasSideEffects = 0, Predicates = [UseAVX] in {
1779 def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1780 (ins FR64:$src1, FR64:$src2),
1781 "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
1782 IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
1783 Sched<[WriteCvtF2F]>;
1785 def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1786 (ins FR64:$src1, f64mem:$src2),
1787 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1788 [], IIC_SSE_CVT_Scalar_RM>,
1789 XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
      Requires<[UseAVX]>;
1796 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1797 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1798 [(set FR32:$dst, (fpround FR64:$src))],
1799 IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
1800 def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1801 "cvtsd2ss\t{$src, $dst|$dst, $src}",
1802 [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
1803 IIC_SSE_CVT_Scalar_RM>,
XD, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1807 let isCodeGenOnly = 1 in {
1808 def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
1809 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1810 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
1813 IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
1814 Sched<[WriteCvtF2F]>;
1815 def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
1816 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1817 "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1818 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1819 VR128:$src1, sse_load_f64:$src2))],
1820 IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
1821 Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1823 let Constraints = "$src1 = $dst" in {
1824 def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
1825 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1826 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
1829 IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
1830 Sched<[WriteCvtF2F]>;
1831 def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
1832 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1833 "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1834 [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1835 VR128:$src1, sse_load_f64:$src2))],
1836 IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1839 } // isCodeGenOnly = 1
1841 // Convert scalar single to scalar double
1842 // SSE2 instructions with XS prefix
1843 let hasSideEffects = 0, Predicates = [UseAVX] in {
1844 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1845 (ins FR32:$src1, FR32:$src2),
1846 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1847 [], IIC_SSE_CVT_Scalar_RR>,
1848 XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
1849 Sched<[WriteCvtF2F]>;
1851 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1852 (ins FR32:$src1, f32mem:$src2),
1853 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1854 [], IIC_SSE_CVT_Scalar_RM>,
1855 XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1859 def : Pat<(f64 (fpextend FR32:$src)),
1860 (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
1861 def : Pat<(fpextend (loadf32 addr:$src)),
1862 (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
1864 def : Pat<(extloadf32 addr:$src),
1865 (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
1866 Requires<[UseAVX, OptForSize]>;
1867 def : Pat<(extloadf32 addr:$src),
1868 (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
1869 Requires<[UseAVX, OptForSpeed]>;
1871 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1872 "cvtss2sd\t{$src, $dst|$dst, $src}",
1873 [(set FR64:$dst, (fpextend FR32:$src))],
1874 IIC_SSE_CVT_Scalar_RR>, XS,
1875 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
1876 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1877 "cvtss2sd\t{$src, $dst|$dst, $src}",
1878 [(set FR64:$dst, (extloadf32 addr:$src))],
1879 IIC_SSE_CVT_Scalar_RM>, XS,
1880 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
1887 def : Pat<(fpextend (loadf32 addr:$src)),
1888 (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
1889 def : Pat<(extloadf32 addr:$src),
1890 (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
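
// As a concrete illustration (assumed codegen, shown for exposition): for a
// plain f32->f64 extending load, OptForSize can fold it into
// "cvtss2sd (%rdi), %xmm0", while OptForSpeed prefers
// "movss (%rdi), %xmm0" + "cvtss2sd %xmm0, %xmm0", because the movss to a
// fresh register breaks the dependence on the destination's previous value.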
1892 let isCodeGenOnly = 1 in {
1893 def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
1894 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1895 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
1898 IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
1899 Sched<[WriteCvtF2F]>;
1900 def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
1901 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1902 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
1905 IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
1906 Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1907 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1908 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
1909 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1910 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
1913 IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
1914 Sched<[WriteCvtF2F]>;
1915 def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
1916 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1917 "cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
  (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
1920 IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1923 } // isCodeGenOnly = 1
1925 // Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1926 // (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1927 // vmovs{s,d} instructions
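// E.g. (an assumed source-level origin): _mm_cvtsd_ss(a, b) is emitted as a
// scalar convert whose result is blended into a with a movss-style shuffle;
// the patterns below select the Int_* convert directly, so no separate
// vmovss/vmovsd is left behind.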
1928 let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1931 (v4f32 (scalar_to_vector
1932 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1933 (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1937 (v2f64 (scalar_to_vector
1938 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1939 (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1943 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1944 (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1948 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1949 (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1953 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1954 (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1958 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1959 (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
1960 } // Predicates = [UseAVX]
1962 let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1965 (v4f32 (scalar_to_vector
1966 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1967 (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1971 (v2f64 (scalar_to_vector
1972 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1973 (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1977 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
1978 (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
1982 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
1983 (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
1984 } // Predicates = [UseSSE2]
1986 let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1989 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
1990 (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
1994 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
1995 (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
1996 } // Predicates = [UseSSE1]
1998 // Convert packed single/double fp to doubleword
1999 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2000 "cvtps2dq\t{$src, $dst|$dst, $src}",
2001 [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
2002 IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
2003 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2004 "cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
2007 IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2008 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2009 "cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
2012 IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2013 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2014 "cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
2017 IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2018 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2019 "cvtps2dq\t{$src, $dst|$dst, $src}",
2020 [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
2021 IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
2022 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2023 "cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
2026 IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2029 // Convert Packed Double FP to Packed DW Integers
2030 let Predicates = [HasAVX, NoVLX] in {
2031 // The assembler can recognize rr 256-bit instructions by seeing a ymm
2032 // register, but the same isn't true when using memory operands instead.
2033 // Provide other assembly rr and rm forms to address this explicitly.
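// For example, "vcvtpd2dq (%rax), %xmm0" would leave the source width
// ambiguous, so the "vcvtpd2dqx" (128-bit) and "vcvtpd2dqy" (256-bit)
// spellings below state it explicitly.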
2034 def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2035 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
2038 VEX, Sched<[WriteCvtF2I]>;
2041 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2042 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
2043 def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2044 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
2047 Sched<[WriteCvtF2ILd]>;
2048 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2049 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
2052 def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2053 "vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
2056 VEX, VEX_L, Sched<[WriteCvtF2I]>;
2057 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2058 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
2061 VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2062 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
2063 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
2064 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
}
2068 def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2069 "cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
2072 IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
2073 def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2074 "cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
2077 IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2079 // Convert with truncation packed single/double fp to doubleword
2080 // SSE2 packed instructions with XS prefix
2081 let Predicates = [HasAVX, NoVLX] in {
2082 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2083 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
2086 IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
2087 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2088 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
2091 IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2092 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2093 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
2096 IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2097 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2098 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
2101 IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
}
2105 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2106 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
2109 IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
2110 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2111 "cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
2114 IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2116 let Predicates = [HasAVX, NoVLX] in
2117 def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2118 "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
2121 IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
2123 // The assembler can recognize rr 256-bit instructions by seeing a ymm
2124 // register, but the same isn't true when using memory operands instead.
2125 // Provide other assembly rr and rm forms to address this explicitly.
2128 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
2129 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
2130 let Predicates = [HasAVX, NoVLX] in
2131 def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2132 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
2135 IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2136 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
2137 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
2140 let Predicates = [HasAVX, NoVLX] in {
2141 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2142 "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
2145 IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2146 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2147 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
2150 IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2152 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
2153 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
2154 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
}
2157 let Predicates = [HasAVX, NoVLX] in {
2158 let AddedComplexity = 15 in {
2159 def : Pat<(X86vzmovl (v2i64 (bitconvert
2160 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
2161 (VCVTPD2DQrr VR128:$src)>;
2162 def : Pat<(X86vzmovl (v2i64 (bitconvert
2163 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
(VCVTTPD2DQrr VR128:$src)>;
}
2166 } // Predicates = [HasAVX]
2168 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2169 "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
2172 IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2173 def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
2174 "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
2177 IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
2179 let Predicates = [UseSSE2] in {
2180 let AddedComplexity = 15 in {
2181 def : Pat<(X86vzmovl (v2i64 (bitconvert
2182 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
2183 (CVTPD2DQrr VR128:$src)>;
2184 def : Pat<(X86vzmovl (v2i64 (bitconvert
2185 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
(CVTTPD2DQrr VR128:$src)>;
}
2188 } // Predicates = [UseSSE2]
2190 // Convert packed single to packed double
2191 let Predicates = [HasAVX, NoVLX] in {
2192 // SSE2 instructions without OpSize prefix
2193 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2194 "vcvtps2pd\t{$src, $dst|$dst, $src}",
2195 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
2196 IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
2197 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
2198 "vcvtps2pd\t{$src, $dst|$dst, $src}",
2199 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
2200 IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
2201 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
2202 "vcvtps2pd\t{$src, $dst|$dst, $src}",
2203 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
2204 IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
2205 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
2206 "vcvtps2pd\t{$src, $dst|$dst, $src}",
2207 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
2211 let Predicates = [UseSSE2] in {
2212 def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2213 "cvtps2pd\t{$src, $dst|$dst, $src}",
2214 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
2215 IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
2216 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
2217 "cvtps2pd\t{$src, $dst|$dst, $src}",
2218 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
2222 // Convert Packed DW Integers to Packed Double FP
2223 let Predicates = [HasAVX, NoVLX] in {
2224 let hasSideEffects = 0, mayLoad = 1 in
2225 def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2226 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
2229 VEX, Sched<[WriteCvtI2FLd]>;
2230 def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2231 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
2234 VEX, Sched<[WriteCvtI2F]>;
2235 def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
2236 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
2239 VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
2240 def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
2241 "vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
  (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtI2F]>;
}
2247 let hasSideEffects = 0, mayLoad = 1 in
2248 def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2249 "cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
2253 def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2254 "cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
  (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2259 // AVX register conversion intrinsics
2260 let Predicates = [HasAVX, NoVLX] in {
2261 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
2262 (VCVTDQ2PDrm addr:$src)>;
2263 } // Predicates = [HasAVX, NoVLX]
2265 // SSE2 register conversion intrinsics
2266 let Predicates = [UseSSE2] in {
2267 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
2268 (CVTDQ2PDrm addr:$src)>;
2269 } // Predicates = [UseSSE2]
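
// The scalar_to_vector-of-loadi64 shape above is what a 64-bit partial
// vector load typically legalizes to; e.g. (an assumed origin)
// _mm_cvtepi32_pd of a value produced by _mm_loadl_epi64 can reach
// instruction selection in this form, and these patterns fold the load into
// cvtdq2pd's memory operand.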
2271 // Convert packed double to packed single
2272 // The assembler can recognize rr 256-bit instructions by seeing a ymm
2273 // register, but the same isn't true when using memory operands instead.
2274 // Provide other assembly rr and rm forms to address this explicitly.
2275 let Predicates = [HasAVX, NoVLX] in
2276 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2277 "cvtpd2ps\t{$src, $dst|$dst, $src}",
2278 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
2279 IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
2282 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
2283 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
2284 let Predicates = [HasAVX, NoVLX] in
2285 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2286 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
2287 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
2288 IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
2289 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
2290 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
2293 let Predicates = [HasAVX, NoVLX] in {
2294 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2295 "cvtpd2ps\t{$src, $dst|$dst, $src}",
2296 [(set VR128:$dst, (fpround VR256:$src))],
2297 IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
2298 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2299 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
2300 [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
2301 IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
2303 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
2304 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
2305 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
}
2308 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2309 "cvtpd2ps\t{$src, $dst|$dst, $src}",
2310 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
2311 IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
2312 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2313 "cvtpd2ps\t{$src, $dst|$dst, $src}",
2314 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
2315 IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2317 // AVX 256-bit register conversion intrinsics
2318 // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
2319 // whenever possible to avoid declaring two versions of each one.
2321 let Predicates = [HasAVX, NoVLX] in {
2322 // Match fpround and fpextend for 128/256-bit conversions
2323 let AddedComplexity = 15 in
2324 def : Pat<(X86vzmovl (v2f64 (bitconvert
2325 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(VCVTPD2PSrr VR128:$src)>;
}
2329 let Predicates = [UseSSE2] in {
// Match fpround and fpextend for 128-bit conversions
2331 let AddedComplexity = 15 in
2332 def : Pat<(X86vzmovl (v2f64 (bitconvert
2333 (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(CVTPD2PSrr VR128:$src)>;
}
2337 //===----------------------------------------------------------------------===//
2338 // SSE 1 & 2 - Compare Instructions
2339 //===----------------------------------------------------------------------===//
2341 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
2342 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
2343 Operand CC, SDNode OpNode, ValueType VT,
2344 PatFrag ld_frag, string asm, string asm_alt,
2345 OpndItins itins, ImmLeaf immLeaf> {
2346 let isCommutable = 1 in
2347 def rr : SIi8<0xC2, MRMSrcReg,
2348 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
2349 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
2350 itins.rr>, Sched<[itins.Sched]>;
2351 def rm : SIi8<0xC2, MRMSrcMem,
2352 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
2353 [(set RC:$dst, (OpNode (VT RC:$src1),
2354 (ld_frag addr:$src2), immLeaf:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
2358 // Accept explicit immediate argument form instead of comparison code.
2359 let isAsmParserOnly = 1, hasSideEffects = 0 in {
2360 def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
2361 (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
2362 IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
2365 (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
2366 IIC_SSE_ALU_F32S_RM>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
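
// For example, the assembler accepts both the predicate mnemonic
// "cmpless %xmm1, %xmm0" (via the cmp${cc}ss forms) and the raw immediate
// form "cmpss $2, %xmm1, %xmm0" (via the *_alt forms); both encode the same
// less-than-or-equal compare.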
2371 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
2372 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2373 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2374 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
2375 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
2376 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2377 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2378 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
2379 XD, VEX_4V, VEX_LIG;
2381 let Constraints = "$src1 = $dst" in {
2382 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
2383 "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
2384 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
2386 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
2387 "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
2388 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSE_ALU_F64S, i8immZExt3>, XD;
}
2392 multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
2393 Intrinsic Int, string asm, OpndItins itins,
2394 ImmLeaf immLeaf, ComplexPattern mem_cpat> {
2395 def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
2396 (ins VR128:$src1, VR128:$src, CC:$cc), asm,
2397 [(set VR128:$dst, (Int VR128:$src1,
2398 VR128:$src, immLeaf:$cc))],
itins.rr>,
Sched<[itins.Sched]>;
2401 def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
2402 (ins VR128:$src1, memop:$src, CC:$cc), asm,
2403 [(set VR128:$dst, (Int VR128:$src1,
2404 mem_cpat:$src, immLeaf:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
2409 let isCodeGenOnly = 1 in {
2410 // Aliases to match intrinsics which expect XMM operand(s).
2411 defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
2412 "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
XS, VEX_4V;
2415 defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
2416 "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
XD, VEX_4V;
2419 let Constraints = "$src1 = $dst" in {
2420 defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
2421 "cmp${cc}ss\t{$src, $dst|$dst, $src}",
2422 SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
2423 defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
2424 "cmp${cc}sd\t{$src, $dst|$dst, $src}",
SSE_ALU_F64S, i8immZExt3, sse_load_f64>, XD;
}
} // isCodeGenOnly = 1
2431 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
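// Both forms compare the low elements and set ZF/PF/CF; they differ only in
// NaN handling: comiss/comisd signal an invalid-operation exception on a
// QNaN operand, while ucomiss/ucomisd stay quiet.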
2432 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
2433 ValueType vt, X86MemOperand x86memop,
2434 PatFrag ld_frag, string OpcodeStr> {
2435 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
2436 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
2440 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
2441 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
2442 [(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))],
IIC_SSE_COMIS_RM>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
2448 // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
2449 multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
2450 ValueType vt, Operand memop,
2451 ComplexPattern mem_cpat, string OpcodeStr> {
2452 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
2453 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
2457 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
2458 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
                      mem_cpat:$src2))],
IIC_SSE_COMIS_RM>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
2465 let Defs = [EFLAGS] in {
2466 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
2467 "ucomiss">, PS, VEX, VEX_LIG;
2468 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
2469 "ucomisd">, PD, VEX, VEX_LIG;
2470 let Pattern = []<dag> in {
2471 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
2472 "comiss">, PS, VEX, VEX_LIG;
2473 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
2474 "comisd">, PD, VEX, VEX_LIG;
2477 let isCodeGenOnly = 1 in {
2478 defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
2479 sse_load_f32, "ucomiss">, PS, VEX;
2480 defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
2481 sse_load_f64, "ucomisd">, PD, VEX;
2483 defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
2484 sse_load_f32, "comiss">, PS, VEX;
2485 defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
sse_load_f64, "comisd">, PD, VEX;
}
defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                              "ucomiss">, PS;
defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                              "ucomisd">, PD;
2493 let Pattern = []<dag> in {
defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                             "comiss">, PS;
defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                             "comisd">, PD;
}
2500 let isCodeGenOnly = 1 in {
2501 defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
2502 sse_load_f32, "ucomiss">, PS;
2503 defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
2504 sse_load_f64, "ucomisd">, PD;
2506 defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
2507 sse_load_f32, "comiss">, PS;
2508 defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
sse_load_f64, "comisd">, PD;
}
2511 } // Defs = [EFLAGS]
2513 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
2514 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
2515 Operand CC, Intrinsic Int, string asm,
2516 string asm_alt, Domain d, ImmLeaf immLeaf,
2517 PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
2518 let isCommutable = 1 in
2519 def rri : PIi8<0xC2, MRMSrcReg,
2520 (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
[(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
itins.rr, d>,
Sched<[WriteFAdd]>;
2524 def rmi : PIi8<0xC2, MRMSrcMem,
2525 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
itins.rm, d>,
2528 Sched<[WriteFAddLd, ReadAfterLd]>;
2530 // Accept explicit immediate argument form instead of comparison code.
2531 let isAsmParserOnly = 1, hasSideEffects = 0 in {
2532 def rri_alt : PIi8<0xC2, MRMSrcReg,
2533 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
2534 asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
let mayLoad = 1 in
def rmi_alt : PIi8<0xC2, MRMSrcMem,
2537 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
2538 asm_alt, [], itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
}
2543 defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
2544 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2545 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2546 SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
2547 defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
2548 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2549 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2550 SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
2551 defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
2552 "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2553 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2554 SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
2555 defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
2556 "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2557 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2558 SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
2559 let Constraints = "$src1 = $dst" in {
2560 defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
2561 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
2562 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2563 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
2564 defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
2565 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
2566 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
2570 let Predicates = [HasAVX] in {
2571 def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
2572 (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
2573 def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
2574 (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
2575 def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
2576 (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
2577 def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
2578 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
2580 def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
2581 (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
2582 def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
2583 (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
2584 def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
2585 (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
2586 def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
2590 let Predicates = [UseSSE1] in {
2591 def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
2592 (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
2593 def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
2597 let Predicates = [UseSSE2] in {
2598 def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
2599 (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
2600 def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2604 //===----------------------------------------------------------------------===//
2605 // SSE 1 & 2 - Shuffle Instructions
2606 //===----------------------------------------------------------------------===//
2608 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
2609 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
Domain d> {
2612 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2613 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2614 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2615 (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
2616 Sched<[WriteFShuffleLd, ReadAfterLd]>;
2617 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2618 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2619 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2620 (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
Sched<[WriteFShuffle]>;
}
2624 let Predicates = [HasAVX, NoVLX] in {
2625 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2626 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2627 loadv4f32, SSEPackedSingle>, PS, VEX_4V;
2628 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2629 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2630 loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
2631 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2632 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2633 loadv2f64, SSEPackedDouble>, PD, VEX_4V;
2634 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2635 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
}
2638 let Constraints = "$src1 = $dst" in {
2639 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2640 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2641 memopv4f32, SSEPackedSingle>, PS;
2642 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2643 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
memopv2f64, SSEPackedDouble>, PD;
}
2647 let Predicates = [HasAVX, NoVLX] in {
2648 def : Pat<(v4i32 (X86Shufp VR128:$src1,
2649 (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
2650 (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
2651 def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
2652 (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
2654 def : Pat<(v2i64 (X86Shufp VR128:$src1,
2655 (loadv2i64 addr:$src2), (i8 imm:$imm))),
2656 (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
2657 def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
2658 (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
2661 def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
2662 (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
2663 def : Pat<(v8i32 (X86Shufp VR256:$src1,
2664 (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
2665 (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
2667 def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
2668 (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
2669 def : Pat<(v4i64 (X86Shufp VR256:$src1,
2670 (loadv4i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}
2674 let Predicates = [UseSSE1] in {
2675 def : Pat<(v4i32 (X86Shufp VR128:$src1,
2676 (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
2677 (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
2678 def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2682 let Predicates = [UseSSE2] in {
2683 // Generic SHUFPD patterns
2684 def : Pat<(v2i64 (X86Shufp VR128:$src1,
2685 (memopv2i64 addr:$src2), (i8 imm:$imm))),
2686 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
2687 def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2691 //===----------------------------------------------------------------------===//
2692 // SSE 1 & 2 - Unpack FP Instructions
2693 //===----------------------------------------------------------------------===//
2695 /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
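/// For reference: unpcklps interleaves the low halves of its operands,
/// dst = { src1[0], src2[0], src1[1], src2[1] }, and unpckhps does the same
/// with the high halves; the pd variants interleave one double from each.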
2696 multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2697 PatFrag mem_frag, RegisterClass RC,
2698 X86MemOperand x86memop, string asm,
2699 Domain d, bit IsCommutable = 0> {
2700 let isCommutable = IsCommutable in
2701 def rr : PI<opc, MRMSrcReg,
2702 (outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
           (vt (OpNode RC:$src1, RC:$src2)))],
2705 IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
2706 def rm : PI<opc, MRMSrcMem,
2707 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
           (vt (OpNode RC:$src1,
                       (mem_frag addr:$src2))))],
IIC_SSE_UNPCK, d>,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
2715 let Predicates = [HasAVX, NoVLX] in {
2716 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
2717 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2718 SSEPackedSingle>, PS, VEX_4V;
2719 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
2720 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2721 SSEPackedDouble>, PD, VEX_4V;
2722 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
2723 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2724 SSEPackedSingle>, PS, VEX_4V;
2725 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
2726 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2727 SSEPackedDouble>, PD, VEX_4V;
2729 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
2730 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2731 SSEPackedSingle>, PS, VEX_4V, VEX_L;
2732 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
2733 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2734 SSEPackedDouble>, PD, VEX_4V, VEX_L;
2735 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
2736 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2737 SSEPackedSingle>, PS, VEX_4V, VEX_L;
2738 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
2739 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2740 SSEPackedDouble>, PD, VEX_4V, VEX_L;
2741 }// Predicates = [HasAVX, NoVLX]
2742 let Constraints = "$src1 = $dst" in {
2743 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
2744 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2745 SSEPackedSingle>, PS;
2746 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
2747 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2748 SSEPackedDouble, 1>, PD;
2749 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
2750 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2751 SSEPackedSingle>, PS;
2752 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
2753 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2754 SSEPackedDouble>, PD;
2755 } // Constraints = "$src1 = $dst"
2757 let Predicates = [HasAVX1Only] in {
2758 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
2759 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2760 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2761 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2762 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
2763 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2764 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2765 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2767 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2768 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2769 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2770 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2771 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2772 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2773 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2774 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2775 }
2777 //===----------------------------------------------------------------------===//
2778 // SSE 1 & 2 - Extract Floating-Point Sign mask
2779 //===----------------------------------------------------------------------===//
2781 /// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
2782 multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2783 string asm, Domain d> {
2784 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2785 !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2786 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
2787 Sched<[WriteVecLogic]>;
2788 }
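// movmskps/movmskpd pack the sign bit of each source element into the low
// bits of a GPR, e.g. (illustrative C):
//   int m = _mm_movemask_ps(v);   // bit i of m = sign bit of v[i]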
2790 let Predicates = [HasAVX] in {
2791 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2792 SSEPackedSingle>, PS, VEX;
2793 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2794 SSEPackedDouble>, PD, VEX;
2795 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2796 SSEPackedSingle>, PS, VEX, VEX_L;
2797 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2798 SSEPackedDouble>, PD, VEX, VEX_L;
2799 }
2801 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2802 SSEPackedSingle>, PS;
2803 defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2804 SSEPackedDouble>, PD;
2806 //===---------------------------------------------------------------------===//
2807 // SSE2 - Packed Integer Logical Instructions
2808 //===---------------------------------------------------------------------===//
2810 let ExeDomain = SSEPackedInt in { // SSE integer instructions
2812 /// PDI_binop_rm - Simple SSE2 binary operator.
2813 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2814 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2815 X86MemOperand x86memop, OpndItins itins,
2816 bit IsCommutable, bit Is2Addr> {
2817 let isCommutable = IsCommutable in
2818 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2819 (ins RC:$src1, RC:$src2),
2820 !if(Is2Addr,
2821 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2822 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2823 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
2824 Sched<[itins.Sched]>;
2825 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2826 (ins RC:$src1, x86memop:$src2),
2827 !if(Is2Addr,
2828 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2829 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2830 [(set RC:$dst, (OpVT (OpNode RC:$src1,
2831 (bitconvert (memop_frag addr:$src2)))))],
2832 itins.rm>,
2833 Sched<[itins.Sched.Folded, ReadAfterLd]>;
2834 }
2835 } // ExeDomain = SSEPackedInt
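// Note: the !if(Is2Addr, ...) above selects the destructive two-operand
// AT&T syntax used by the tied SSE forms, e.g.
//   pand %xmm1, %xmm0           ; xmm0 &= xmm1
// versus the three-operand VEX syntax used when Is2Addr is 0, e.g.
//   vpand %xmm2, %xmm1, %xmm0   ; xmm0 = xmm1 & xmm2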
2837 multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2838 ValueType OpVT128, ValueType OpVT256,
2839 OpndItins itins, bit IsCommutable = 0, Predicate prd> {
2840 let Predicates = [HasAVX, prd] in
2841 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2842 VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
2844 let Constraints = "$src1 = $dst" in
2845 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2846 memopv2i64, i128mem, itins, IsCommutable, 1>;
2848 let Predicates = [HasAVX2, prd] in
2849 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2850 OpVT256, VR256, loadv4i64, i256mem, itins,
2851 IsCommutable, 0>, VEX_4V, VEX_L;
2852 }
2854 // These are ordered here for pattern ordering requirements with the fp versions
2856 defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2857 SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
2858 defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2859 SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
2860 defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2861 SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
2862 defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2863 SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
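// PANDN is not commutable because it complements only its first operand:
//   pandn %xmm1, %xmm0  ==>  xmm0 = ~xmm0 & xmm1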
2865 //===----------------------------------------------------------------------===//
2866 // SSE 1 & 2 - Logical Instructions
2867 //===----------------------------------------------------------------------===//
2869 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2871 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2872 SDNode OpNode> {
2873 let Predicates = [HasAVX, NoVLX] in {
2874 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2875 !strconcat(OpcodeStr, "ps"), f256mem,
2876 [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
2877 (bc_v4i64 (v8f32 VR256:$src2))))],
2878 [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
2879 (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
2881 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2882 !strconcat(OpcodeStr, "pd"), f256mem,
2883 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
2884 (bc_v4i64 (v4f64 VR256:$src2))))],
2885 [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
2886 (loadv4i64 addr:$src2)))], 0>,
2887 PD, VEX_4V, VEX_L;
2889 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2890 !strconcat(OpcodeStr, "ps"), f128mem,
2891 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
2892 (bc_v2i64 (v4f32 VR128:$src2))))],
2893 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
2894 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
2896 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2897 !strconcat(OpcodeStr, "pd"), f128mem,
2898 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
2899 (bc_v2i64 (v2f64 VR128:$src2))))],
2900 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
2901 (loadv2i64 addr:$src2)))], 0>,
2902 PD, VEX_4V;
2903 }
2905 let Constraints = "$src1 = $dst" in {
2906 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2907 !strconcat(OpcodeStr, "ps"), f128mem,
2908 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
2909 (bc_v2i64 (v4f32 VR128:$src2))))],
2910 [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
2911 (memopv2i64 addr:$src2)))]>, PS;
2913 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2914 !strconcat(OpcodeStr, "pd"), f128mem,
2915 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
2916 (bc_v2i64 (v2f64 VR128:$src2))))],
2917 [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
2918 (memopv2i64 addr:$src2)))]>, PD;
2919 }
2920 }
2922 defm AND : sse12_fp_packed_logical<0x54, "and", and>;
2923 defm OR : sse12_fp_packed_logical<0x56, "or", or>;
2924 defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
2925 let isCommutable = 0 in
2926 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2928 // If only AVX1 is supported, we need to handle integer operations with
2929 // floating point instructions since the integer versions aren't available.
2930 let Predicates = [HasAVX1Only] in {
2931 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2932 (VANDPSYrr VR256:$src1, VR256:$src2)>;
2933 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2934 (VORPSYrr VR256:$src1, VR256:$src2)>;
2935 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2936 (VXORPSYrr VR256:$src1, VR256:$src2)>;
2937 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2938 (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2940 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2941 (VANDPSYrm VR256:$src1, addr:$src2)>;
2942 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2943 (VORPSYrm VR256:$src1, addr:$src2)>;
2944 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2945 (VXORPSYrm VR256:$src1, addr:$src2)>;
2946 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2947 (VANDNPSYrm VR256:$src1, addr:$src2)>;
2948 }
2950 let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
2951 // Use packed logical operations for scalar ops.
2952 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
2953 (COPY_TO_REGCLASS (VANDPDrr
2954 (COPY_TO_REGCLASS FR64:$src1, VR128),
2955 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
2956 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
2957 (COPY_TO_REGCLASS (VORPDrr
2958 (COPY_TO_REGCLASS FR64:$src1, VR128),
2959 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
2960 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
2961 (COPY_TO_REGCLASS (VXORPDrr
2962 (COPY_TO_REGCLASS FR64:$src1, VR128),
2963 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
2964 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
2965 (COPY_TO_REGCLASS (VANDNPDrr
2966 (COPY_TO_REGCLASS FR64:$src1, VR128),
2967 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
2969 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2970 (COPY_TO_REGCLASS (VANDPSrr
2971 (COPY_TO_REGCLASS FR32:$src1, VR128),
2972 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2973 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2974 (COPY_TO_REGCLASS (VORPSrr
2975 (COPY_TO_REGCLASS FR32:$src1, VR128),
2976 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2977 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2978 (COPY_TO_REGCLASS (VXORPSrr
2979 (COPY_TO_REGCLASS FR32:$src1, VR128),
2980 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2981 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
2982 (COPY_TO_REGCLASS (VANDNPSrr
2983 (COPY_TO_REGCLASS FR32:$src1, VR128),
2984 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2985 }
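// There are no scalar forms of the FP logical instructions, so the scalar
// X86fand/X86for/X86fxor/X86fandn nodes above are lowered by copying the
// operands into VR128 and reusing the 128-bit packed forms; the upper
// lanes of the result are don't-care values.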
2987 let Predicates = [UseSSE1] in {
2988 // Use packed logical operations for scalar ops.
2989 def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
2990 (COPY_TO_REGCLASS (ANDPSrr
2991 (COPY_TO_REGCLASS FR32:$src1, VR128),
2992 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2993 def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
2994 (COPY_TO_REGCLASS (ORPSrr
2995 (COPY_TO_REGCLASS FR32:$src1, VR128),
2996 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
2997 def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
2998 (COPY_TO_REGCLASS (XORPSrr
2999 (COPY_TO_REGCLASS FR32:$src1, VR128),
3000 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
3001 def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
3002 (COPY_TO_REGCLASS (ANDNPSrr
3003 (COPY_TO_REGCLASS FR32:$src1, VR128),
3004 (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
3005 }
3007 let Predicates = [UseSSE2] in {
3008 // Use packed logical operations for scalar ops.
3009 def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
3010 (COPY_TO_REGCLASS (ANDPDrr
3011 (COPY_TO_REGCLASS FR64:$src1, VR128),
3012 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
3013 def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
3014 (COPY_TO_REGCLASS (ORPDrr
3015 (COPY_TO_REGCLASS FR64:$src1, VR128),
3016 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
3017 def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
3018 (COPY_TO_REGCLASS (XORPDrr
3019 (COPY_TO_REGCLASS FR64:$src1, VR128),
3020 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
3021 def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
3022 (COPY_TO_REGCLASS (ANDNPDrr
3023 (COPY_TO_REGCLASS FR64:$src1, VR128),
3024 (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
3025 }
3027 // Patterns for packed operations when we don't have integer type available.
3028 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
3029 (ANDPSrr VR128:$src1, VR128:$src2)>;
3030 def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
3031 (ORPSrr VR128:$src1, VR128:$src2)>;
3032 def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
3033 (XORPSrr VR128:$src1, VR128:$src2)>;
3034 def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
3035 (ANDNPSrr VR128:$src1, VR128:$src2)>;
3037 def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
3038 (ANDPSrm VR128:$src1, addr:$src2)>;
3039 def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
3040 (ORPSrm VR128:$src1, addr:$src2)>;
3041 def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
3042 (XORPSrm VR128:$src1, addr:$src2)>;
3043 def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
3044 (ANDNPSrm VR128:$src1, addr:$src2)>;
3046 //===----------------------------------------------------------------------===//
3047 // SSE 1 & 2 - Arithmetic Instructions
3048 //===----------------------------------------------------------------------===//
3050 /// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
3051 /// vector forms.
3052 ///
3053 /// In addition, we also have a special variant of the scalar form here to
3054 /// represent the associated intrinsic operation. This form is unlike the
3055 /// plain scalar form, in that it takes an entire vector (instead of a scalar)
3056 /// and leaves the top elements unmodified (therefore these cannot be commuted).
3058 /// These three forms can each be reg+reg or reg+mem.
3061 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
3062 /// classes.
3063 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
3064 SDNode OpNode, SizeItins itins> {
3065 let Predicates = [HasAVX, NoVLX] in {
3066 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
3067 VR128, v4f32, f128mem, loadv4f32,
3068 SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
3069 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
3070 VR128, v2f64, f128mem, loadv2f64,
3071 SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
3073 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
3074 OpNode, VR256, v8f32, f256mem, loadv8f32,
3075 SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
3076 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
3077 OpNode, VR256, v4f64, f256mem, loadv4f64,
3078 SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
3079 }
3081 let Constraints = "$src1 = $dst" in {
3082 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
3083 v4f32, f128mem, memopv4f32, SSEPackedSingle,
3084 itins.s>, PS;
3085 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
3086 v2f64, f128mem, memopv2f64, SSEPackedDouble,
3087 itins.d>, PD;
3088 }
3089 }
3091 multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3092 SizeItins itins> {
3093 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
3094 OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
3095 XS, VEX_4V, VEX_LIG;
3096 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
3097 OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
3098 XD, VEX_4V, VEX_LIG;
3100 let Constraints = "$src1 = $dst" in {
3101 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
3102 OpNode, FR32, f32mem, SSEPackedSingle,
3103 itins.s>, XS;
3104 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
3105 OpNode, FR64, f64mem, SSEPackedDouble,
3106 itins.d>, XD;
3107 }
3108 }
3110 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
3111 SDPatternOperator IntSS,
3112 SDPatternOperator IntSD,
3113 SizeItins itins> {
3114 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
3115 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
3116 SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
3117 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
3118 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
3119 SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
3121 let Constraints = "$src1 = $dst" in {
3122 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
3123 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
3124 SSEPackedSingle, itins.s>, XS;
3125 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
3126 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
3127 SSEPackedDouble, itins.d>, XD;
3128 }
3129 }
3131 // Binary Arithmetic instructions
3132 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
3133 basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
3134 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
3135 SSE_ALU_ITINS_S>;
3136 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
3137 basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
3138 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
3139 SSE_MUL_ITINS_S>;
3140 let isCommutable = 0 in {
3141 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
3142 basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
3143 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
3144 SSE_ALU_ITINS_S>;
3145 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
3146 basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
3147 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
3148 SSE_DIV_ITINS_S>;
3149 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
3150 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
3151 basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
3152 int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
3153 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
3154 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
3155 basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
3156 int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
3157 }
3159 let isCodeGenOnly = 1 in {
3160 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
3161 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
3162 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
3163 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
3164 }
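// The isCodeGenOnly MAXC/MINC variants above use the commutable X86fmaxc and
// X86fminc nodes; plain X86fmax/X86fmin are not commutable because
// (v)maxss/(v)minss return the second source operand on NaN or signed-zero
// inputs, so operand order is significant.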
3166 // Patterns used to select SSE scalar fp arithmetic instructions from
3167 // either:
3168 //
3169 // (1) a scalar fp operation followed by a blend
3170 //
3171 // The effect is that the backend no longer emits unnecessary vector
3172 // insert instructions immediately after SSE scalar fp instructions
3173 // like addss or mulss.
3174 //
3175 // For example, given the following code:
3176 //   __m128 foo(__m128 A, __m128 B) {
3177 //     A[0] += B[0];
3178 //     return A;
3179 //   }
3180 //
3181 // Previously we generated:
3182 //   addss %xmm0, %xmm1
3183 //   movss %xmm1, %xmm0
3184 //
3185 // We now generate:
3186 //   addss %xmm1, %xmm0
3187 //
3188 // (2) a vector packed single/double fp operation followed by a vector insert
3189 //
3190 // The effect is that the backend converts the packed fp instruction
3191 // followed by a vector insert into a single SSE scalar fp instruction.
3192 //
3193 // For example, given the following code:
3194 //   __m128 foo(__m128 A, __m128 B) {
3195 //     __m128 C = A + B;
3196 //     return (__m128) {c[0], a[1], a[2], a[3]};
3197 //   }
3198 //
3199 // Previously we generated:
3200 //   addps %xmm0, %xmm1
3201 //   movss %xmm1, %xmm0
3202 //
3203 // We now generate:
3204 //   addss %xmm1, %xmm0
3206 // TODO: Some canonicalization in lowering would simplify the number of
3207 // patterns we have to try to match.
3208 multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
3209 let Predicates = [UseSSE1] in {
3210 // extracted scalar math op with insert via movss
3211 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3212 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
3213 FR32:$src))))),
3214 (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
3215 (COPY_TO_REGCLASS FR32:$src, VR128))>;
3217 // vector math op with insert via movss
3218 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3219 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3220 (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
3221 }
3223 // With SSE 4.1, blendi is preferred to movsd, so match that too.
3224 let Predicates = [UseSSE41] in {
3225 // extracted scalar math op with insert via blend
3226 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3227 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
3228 FR32:$src))), (i8 1))),
3229 (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
3230 (COPY_TO_REGCLASS FR32:$src, VR128))>;
3232 // vector math op with insert via blend
3233 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
3234 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
3235 (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
3236 }
3239 // Repeat everything for AVX.
3240 let Predicates = [UseAVX] in {
3241 // extracted scalar math op with insert via movss
3242 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3243 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
3244 FR32:$src))))),
3245 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
3246 (COPY_TO_REGCLASS FR32:$src, VR128))>;
3248 // extracted scalar math op with insert via blend
3249 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3250 (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
3251 FR32:$src))), (i8 1))),
3252 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
3253 (COPY_TO_REGCLASS FR32:$src, VR128))>;
3255 // vector math op with insert via movss
3256 def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3257 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3258 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
3260 // vector math op with insert via blend
3261 def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
3262 (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
3263 (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
3264 }
3265 }
3267 defm : scalar_math_f32_patterns<fadd, "ADD">;
3268 defm : scalar_math_f32_patterns<fsub, "SUB">;
3269 defm : scalar_math_f32_patterns<fmul, "MUL">;
3270 defm : scalar_math_f32_patterns<fdiv, "DIV">;
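// Roughly, the UseSSE1 fadd instantiation above matches IR of the form:
//   %ext = extractelement <4 x float> %a, i32 0
//   %op  = fadd float %ext, %b
//   %res = insertelement <4 x float> %a, float %op, i32 0
// and selects a single ADDSSrr_Int instead of an add plus a vector insert.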
3272 multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
3273 let Predicates = [UseSSE2] in {
3274 // extracted scalar math op with insert via movsd
3275 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
3276 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
3277 FR64:$src))))),
3278 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
3279 (COPY_TO_REGCLASS FR64:$src, VR128))>;
3281 // vector math op with insert via movsd
3282 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3283 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3284 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
3285 }
3287 // With SSE 4.1, blendi is preferred to movsd, so match those too.
3288 let Predicates = [UseSSE41] in {
3289 // extracted scalar math op with insert via blend
3290 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
3291 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
3292 FR64:$src))), (i8 1))),
3293 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
3294 (COPY_TO_REGCLASS FR64:$src, VR128))>;
3296 // vector math op with insert via blend
3297 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
3298 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
3299 (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
3300 }
3302 // Repeat everything for AVX.
3303 let Predicates = [UseAVX] in {
3304 // extracted scalar math op with insert via movsd
3305 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
3306 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
3307 FR64:$src))))),
3308 (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
3309 (COPY_TO_REGCLASS FR64:$src, VR128))>;
3311 // extracted scalar math op with insert via blend
3312 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
3313 (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
3314 FR64:$src))), (i8 1))),
3315 (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
3316 (COPY_TO_REGCLASS FR64:$src, VR128))>;
3318 // vector math op with insert via movsd
3319 def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3320 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3321 (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
3323 // vector math op with insert via blend
3324 def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
3325 (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
3326 (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
3327 }
3328 }
3330 defm : scalar_math_f64_patterns<fadd, "ADD">;
3331 defm : scalar_math_f64_patterns<fsub, "SUB">;
3332 defm : scalar_math_f64_patterns<fmul, "MUL">;
3333 defm : scalar_math_f64_patterns<fdiv, "DIV">;
3337 /// In addition, we also have a special variant of the scalar form here to
3338 /// represent the associated intrinsic operation. This form is unlike the
3339 /// plain scalar form, in that it takes an entire vector (instead of a
3340 /// scalar) and leaves the top elements undefined.
3342 /// And, we have a special variant form for a full-vector intrinsic form.
3344 let Sched = WriteFSqrt in {
3345 def SSE_SQRTPS : OpndItins<
3346 IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
3347 >;
3349 def SSE_SQRTSS : OpndItins<
3350 IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
3351 >;
3353 def SSE_SQRTPD : OpndItins<
3354 IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
3355 >;
3357 def SSE_SQRTSD : OpndItins<
3358 IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
3359 >;
3360 }
3362 let Sched = WriteFRsqrt in {
3363 def SSE_RSQRTPS : OpndItins<
3364 IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
3365 >;
3367 def SSE_RSQRTSS : OpndItins<
3368 IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
3369 >;
3370 }
3372 let Sched = WriteFRcp in {
3373 def SSE_RCPP : OpndItins<
3374 IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
3375 >;
3377 def SSE_RCPS : OpndItins<
3378 IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
3379 >;
3380 }
3382 /// sse_fp_unop_s - SSE1 unops in scalar form
3383 /// For the non-AVX defs, we need $src1 to be tied to $dst because
3384 /// the HW instructions are 2 operand / destructive.
3385 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
3386 ValueType vt, ValueType ScalarVT,
3387 X86MemOperand x86memop,
3388 Intrinsic Intr,
3389 SDNode OpNode, Domain d, OpndItins itins,
3390 Predicate target, string Suffix> {
3391 let hasSideEffects = 0 in {
3392 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
3393 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
3394 [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
3395 Requires<[target]>;
3397 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
3398 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
3399 [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
3400 Sched<[itins.Sched.Folded, ReadAfterLd]>,
3401 Requires<[target, OptForSize]>;
3403 let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
3404 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
3405 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3406 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3407 let mayLoad = 1 in
3408 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
3409 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3410 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3411 }
3412 }
3414 let Predicates = [target] in {
3415 // These are unary operations, but they are modeled as having 2 source operands
3416 // because the high elements of the destination are unchanged in SSE.
3417 def : Pat<(Intr VR128:$src),
3418 (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
3419 }
3420 // We don't want to fold scalar loads into these instructions unless
3421 // optimizing for size. This is because the folded instruction will have a
3422 // partial register update, while the unfolded sequence will not, e.g.
3423 // movss mem, %xmm0
3424 // rcpss %xmm0, %xmm0
3425 // which has a clobber before the rcp, vs.
3426 // rcpss mem, %xmm0
3427 let Predicates = [target, OptForSize] in {
3428 def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
3429 (!cast<Instruction>(NAME#Suffix##m_Int)
3430 (vt (IMPLICIT_DEF)), addr:$src2)>;
3431 }
3432 }
3434 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
3435 ValueType vt, ValueType ScalarVT,
3436 X86MemOperand x86memop,
3437 Intrinsic Intr, SDNode OpNode, Domain d,
3438 OpndItins itins, string Suffix> {
3439 let hasSideEffects = 0 in {
3440 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
3441 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3442 [], itins.rr, d>, Sched<[itins.Sched]>;
3444 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3445 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3446 [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3447 let isCodeGenOnly = 1 in {
3448 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
3449 (ins VR128:$src1, VR128:$src2),
3450 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3451 []>, Sched<[itins.Sched.Folded]>;
3452 let mayLoad = 1 in
3453 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
3454 (ins VR128:$src1, x86memop:$src2),
3455 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3456 []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3457 }
3458 }
3460 // We don't want to fold scalar loads into these instructions unless
3461 // optimizing for size. This is because the folded instruction will have a
3462 // partial register update, while the unfolded sequence will not, e.g.
3463 // vmovss mem, %xmm0
3464 // vrcpss %xmm0, %xmm0, %xmm0
3465 // which has a clobber before the rcp, vs.
3466 // vrcpss mem, %xmm0, %xmm0
3467 // TODO: In theory, we could fold the load, and avoid the stall caused by
3468 // the partial register store, either in ExeDepFix or with smarter RA.
3469 let Predicates = [UseAVX] in {
3470 def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
3471 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
3472 }
3473 let Predicates = [HasAVX] in {
3474 def : Pat<(Intr VR128:$src),
3475 (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
3476 VR128:$src)>;
3477 }
3478 let Predicates = [HasAVX, OptForSize] in {
3479 def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
3480 (!cast<Instruction>("V"#NAME#Suffix##m_Int)
3481 (vt (IMPLICIT_DEF)), addr:$src2)>;
3482 }
3483 let Predicates = [UseAVX, OptForSize] in {
3484 def : Pat<(ScalarVT (OpNode (load addr:$src))),
3485 (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
3486 addr:$src)>;
3487 }
3488 }
3490 /// sse1_fp_unop_p - SSE1 unops in packed form.
3491 multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
3492 OpndItins itins, list<Predicate> prds> {
3493 let Predicates = prds in {
3494 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3495 !strconcat("v", OpcodeStr,
3496 "ps\t{$src, $dst|$dst, $src}"),
3497 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
3498 itins.rr>, VEX, Sched<[itins.Sched]>;
3499 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3500 !strconcat("v", OpcodeStr,
3501 "ps\t{$src, $dst|$dst, $src}"),
3502 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
3503 itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
3504 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3505 !strconcat("v", OpcodeStr,
3506 "ps\t{$src, $dst|$dst, $src}"),
3507 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
3508 itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
3509 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3510 !strconcat("v", OpcodeStr,
3511 "ps\t{$src, $dst|$dst, $src}"),
3512 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
3513 itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
3514 }
3516 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3517 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3518 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
3519 Sched<[itins.Sched]>;
3520 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3521 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3522 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
3523 Sched<[itins.Sched.Folded]>;
3524 }
3526 /// sse2_fp_unop_p - SSE2 unops in vector forms.
3527 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
3528 SDNode OpNode, OpndItins itins> {
3529 let Predicates = [HasAVX] in {
3530 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3531 !strconcat("v", OpcodeStr,
3532 "pd\t{$src, $dst|$dst, $src}"),
3533 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
3534 itins.rr>, VEX, Sched<[itins.Sched]>;
3535 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3536 !strconcat("v", OpcodeStr,
3537 "pd\t{$src, $dst|$dst, $src}"),
3538 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
3539 itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
3540 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3541 !strconcat("v", OpcodeStr,
3542 "pd\t{$src, $dst|$dst, $src}"),
3543 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
3544 itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
3545 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3546 !strconcat("v", OpcodeStr,
3547 "pd\t{$src, $dst|$dst, $src}"),
3548 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
3549 itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
3550 }
3552 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3553 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3554 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
3555 Sched<[itins.Sched]>;
3556 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3557 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3558 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
3559 Sched<[itins.Sched.Folded]>;
3560 }
3562 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3563 OpndItins itins> {
3564 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
3565 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
3566 SSEPackedSingle, itins, UseSSE1, "SS">, XS;
3567 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
3568 f32mem,
3569 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
3570 SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
3571 }
3573 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
3574 OpndItins itins> {
3575 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
3576 !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
3577 OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
3578 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
3579 f64mem,
3580 !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
3581 OpNode, SSEPackedDouble, itins, "SD">,
3582 XD, VEX_4V, VEX_LIG;
3583 }
3585 // Square root.
3586 defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
3587 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
3588 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
3589 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
3591 // Reciprocal approximations. Note that these typically require refinement
3592 // in order to obtain suitable precision.
3593 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
3594 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
3595 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
3596 sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
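// A common refinement is one Newton-Raphson step, e.g. for rcpps:
//   x1 = x0 * (2.0 - a * x0)
// which roughly doubles the ~12 bits of precision of the hardware estimate.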
3598 // There is no f64 version of the reciprocal approximation instructions.
3600 // TODO: We should add *scalar* op patterns for these just like we have for
3601 // the binops above. If the binop and unop patterns could all be unified
3602 // that would be even better.
3604 multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
3605 SDNode Move, ValueType VT,
3606 Predicate BasePredicate> {
3607 let Predicates = [BasePredicate] in {
3608 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3609 (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3610 }
3612 // With SSE 4.1, blendi is preferred to movs*, so match that too.
3613 let Predicates = [UseSSE41] in {
3614 def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
3615 (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3616 }
3618 // Repeat for AVX versions of the instructions.
3619 let Predicates = [HasAVX] in {
3620 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3621 (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3623 def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
3624 (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3625 }
3626 }
3628 defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3629 v4f32, UseSSE1>;
3630 defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3631 v4f32, UseSSE1>;
3632 defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
3633 v4f32, UseSSE1>;
3634 defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
3635 v2f64, UseSSE2>;
3638 //===----------------------------------------------------------------------===//
3639 // SSE 1 & 2 - Non-temporal stores
3640 //===----------------------------------------------------------------------===//
3642 let AddedComplexity = 400 in { // Prefer non-temporal versions
3643 let SchedRW = [WriteStore] in {
3644 let Predicates = [HasAVX, NoVLX] in {
3645 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3646 (ins f128mem:$dst, VR128:$src),
3647 "movntps\t{$src, $dst|$dst, $src}",
3648 [(alignednontemporalstore (v4f32 VR128:$src),
3649 addr:$dst)],
3650 IIC_SSE_MOVNT>, VEX;
3651 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3652 (ins f128mem:$dst, VR128:$src),
3653 "movntpd\t{$src, $dst|$dst, $src}",
3654 [(alignednontemporalstore (v2f64 VR128:$src),
3655 addr:$dst)],
3656 IIC_SSE_MOVNT>, VEX;
3658 let ExeDomain = SSEPackedInt in
3659 def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
3660 (ins f128mem:$dst, VR128:$src),
3661 "movntdq\t{$src, $dst|$dst, $src}",
3662 [(alignednontemporalstore (v2i64 VR128:$src),
3663 addr:$dst)],
3664 IIC_SSE_MOVNT>, VEX;
3666 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3667 (ins f256mem:$dst, VR256:$src),
3668 "movntps\t{$src, $dst|$dst, $src}",
3669 [(alignednontemporalstore (v8f32 VR256:$src),
3670 addr:$dst)],
3671 IIC_SSE_MOVNT>, VEX, VEX_L;
3672 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3673 (ins f256mem:$dst, VR256:$src),
3674 "movntpd\t{$src, $dst|$dst, $src}",
3675 [(alignednontemporalstore (v4f64 VR256:$src),
3676 addr:$dst)],
3677 IIC_SSE_MOVNT>, VEX, VEX_L;
3678 let ExeDomain = SSEPackedInt in
3679 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3680 (ins f256mem:$dst, VR256:$src),
3681 "movntdq\t{$src, $dst|$dst, $src}",
3682 [(alignednontemporalstore (v4i64 VR256:$src),
3683 addr:$dst)],
3684 IIC_SSE_MOVNT>, VEX, VEX_L;
3685 }
3687 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3688 "movntps\t{$src, $dst|$dst, $src}",
3689 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
3690 IIC_SSE_MOVNT>;
3691 def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3692 "movntpd\t{$src, $dst|$dst, $src}",
3693 [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)],
3694 IIC_SSE_MOVNT>;
3696 let ExeDomain = SSEPackedInt in
3697 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3698 "movntdq\t{$src, $dst|$dst, $src}",
3699 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
3700 IIC_SSE_MOVNT>;
3702 // There is no AVX form for instructions below this point
3703 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3704 "movnti{l}\t{$src, $dst|$dst, $src}",
3705 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
3706 IIC_SSE_MOVNT>,
3707 PS, Requires<[HasSSE2]>;
3708 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3709 "movnti{q}\t{$src, $dst|$dst, $src}",
3710 [(nontemporalstore (i64 GR64:$src), addr:$dst)],
3711 IIC_SSE_MOVNT>,
3712 PS, Requires<[HasSSE2]>;
3713 } // SchedRW = [WriteStore]
3715 let Predicates = [HasAVX, NoVLX] in {
3716 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3717 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3718 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3719 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3720 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3721 (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3723 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3724 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3725 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3726 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3727 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3728 (VMOVNTDQmr addr:$dst, VR128:$src)>;
3729 }
3731 let Predicates = [UseSSE2] in {
3732 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3733 (MOVNTDQmr addr:$dst, VR128:$src)>;
3734 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3735 (MOVNTDQmr addr:$dst, VR128:$src)>;
3736 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3737 (MOVNTDQmr addr:$dst, VR128:$src)>;
3738 }
3740 } // AddedComplexity
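// Non-temporal stores write around the cache hierarchy and are only weakly
// ordered with respect to other stores; code handing the buffer to another
// agent should issue an sfence (defined below) after the movnt* sequence.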
3742 //===----------------------------------------------------------------------===//
3743 // SSE 1 & 2 - Prefetch and memory fence
3744 //===----------------------------------------------------------------------===//
3746 // Prefetch intrinsic.
3747 let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
3748 def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3749 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
3750 IIC_SSE_PREFETCH>, TB;
3751 def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3752 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
3753 IIC_SSE_PREFETCH>, TB;
3754 def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3755 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
3756 IIC_SSE_PREFETCH>, TB;
3757 def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3758 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
3759 IIC_SSE_PREFETCH>, TB;
3760 }
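// The third (locality) operand of the prefetch node selects the variant:
// locality 3 -> prefetcht0, 2 -> prefetcht1, 1 -> prefetcht2, 0 -> prefetchnta,
// e.g. __builtin_prefetch(p, 0, 3) becomes prefetcht0.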
3762 // FIXME: How should flush instruction be modeled?
3763 let SchedRW = [WriteLoad] in {
3764 // Flush cache
3765 def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3766 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
3767 IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
3768 }
3770 let SchedRW = [WriteNop] in {
3771 // Pause. This "instruction" is encoded as "rep; nop", so even though it
3772 // was introduced with SSE2, it's backward compatible.
3773 def PAUSE : I<0x90, RawFrm, (outs), (ins),
3774 "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
3775 OBXS, Requires<[HasSSE2]>;
3776 }
3778 let SchedRW = [WriteFence] in {
3779 // Load, store, and memory fence
3780 // TODO: As with mfence, we may want to ease the availability of sfence/lfence
3781 // to include any 64-bit target.
3782 def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
3783 "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
3784 PS, Requires<[HasSSE1]>;
3785 def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
3786 "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
3787 TB, Requires<[HasSSE2]>;
3788 def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
3789 "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
3790 TB, Requires<[HasMFence]>;
3791 }
3793 def : Pat<(X86MFence), (MFENCE)>;
3795 //===----------------------------------------------------------------------===//
3796 // SSE 1 & 2 - Load/Store MXCSR register
3797 //===----------------------------------------------------------------------===//
3799 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3800 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3801 IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
3802 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3803 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3804 IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
3806 let Predicates = [UseSSE1] in {
3807 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3808 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3809 IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
3810 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3811 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3812 IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
3813 }
3815 //===---------------------------------------------------------------------===//
3816 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3817 //===---------------------------------------------------------------------===//
3819 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3821 let hasSideEffects = 0, SchedRW = [WriteMove] in {
3822 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3823 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3824 VEX;
3825 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3826 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3827 VEX, VEX_L;
3828 def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3829 "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3830 VEX;
3831 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3832 "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3833 VEX, VEX_L;
3834 }
3836 // For Disassembler
3837 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
3838 SchedRW = [WriteMove] in {
3839 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3840 "movdqa\t{$src, $dst|$dst, $src}", [],
3841 IIC_SSE_MOVA_P_RR>,
3842 VEX;
3843 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3844 "movdqa\t{$src, $dst|$dst, $src}", [],
3845 IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
3846 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3847 "movdqu\t{$src, $dst|$dst, $src}", [],
3848 IIC_SSE_MOVU_P_RR>,
3849 VEX;
3850 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3851 "movdqu\t{$src, $dst|$dst, $src}", [],
3852 IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
3853 }
3855 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3856 hasSideEffects = 0, SchedRW = [WriteLoad] in {
3857 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3858 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3859 VEX;
3860 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3861 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3862 VEX, VEX_L;
3863 let Predicates = [HasAVX] in {
3864 def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3865 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3866 XS, VEX;
3867 def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3868 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3869 XS, VEX, VEX_L;
3870 }
3871 }
3873 let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
3874 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
3875 (ins i128mem:$dst, VR128:$src),
3876 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3877 VEX;
3878 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3879 (ins i256mem:$dst, VR256:$src),
3880 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3881 VEX, VEX_L;
3882 let Predicates = [HasAVX] in {
3883 def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3884 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3885 XS, VEX;
3886 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3887 "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3888 XS, VEX, VEX_L;
3889 }
3890 }
3892 let SchedRW = [WriteMove] in {
3893 let hasSideEffects = 0 in {
3894 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3895 "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
3897 def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3898 "movdqu\t{$src, $dst|$dst, $src}",
3899 [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3900 }
3902 // For Disassembler
3903 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3904 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3905 "movdqa\t{$src, $dst|$dst, $src}", [],
3906 IIC_SSE_MOVA_P_RR>;
3908 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3909 "movdqu\t{$src, $dst|$dst, $src}",
3910 [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3911 }
3912 }
3914 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3915 hasSideEffects = 0, SchedRW = [WriteLoad] in {
3916 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3917 "movdqa\t{$src, $dst|$dst, $src}",
3918 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
3919 IIC_SSE_MOVA_P_RM>;
3920 def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3921 "movdqu\t{$src, $dst|$dst, $src}",
3922 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
3923 IIC_SSE_MOVU_P_RM>,
3924 XS, Requires<[UseSSE2]>;
3925 }
3927 let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
3928 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3929 "movdqa\t{$src, $dst|$dst, $src}",
3930 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
3931 IIC_SSE_MOVA_P_MR>;
3932 def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3933 "movdqu\t{$src, $dst|$dst, $src}",
3934 [/*(store (v2i64 VR128:$src), addr:$dst)*/],
3935 IIC_SSE_MOVU_P_MR>,
3936 XS, Requires<[UseSSE2]>;
3937 }
3939 } // ExeDomain = SSEPackedInt
3941 // Aliases to help the assembler pick two byte VEX encodings by swapping the
3942 // operands relative to the normal instructions to use VEX.R instead of VEX.B.
3943 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3944 (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
3945 def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
3946 (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
3947 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3948 (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
3949 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
3950 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
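// For example, "vmovdqa %xmm8, %xmm0" would need the 3-byte VEX prefix to
// encode xmm8 via VEX.B, but the swapped MRMDestReg form encodes xmm8 via
// VEX.R, which the 2-byte VEX prefix can express.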
3952 //===---------------------------------------------------------------------===//
3953 // SSE2 - Packed Integer Arithmetic Instructions
3954 //===---------------------------------------------------------------------===//
3956 let Sched = WriteVecIMul in
3957 def SSE_PMADD : OpndItins<
3958 IIC_SSE_PMADD, IIC_SSE_PMADD
3959 >;
3961 let ExeDomain = SSEPackedInt in { // SSE integer instructions
3963 /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
3964 multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3965 ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3966 PatFrag memop_frag, X86MemOperand x86memop,
3967 OpndItins itins, bit Is2Addr = 1> {
3968 let isCommutable = 1 in
3969 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3970 (ins RC:$src1, RC:$src2),
3971 !if(Is2Addr,
3972 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3973 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3974 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3975 Sched<[itins.Sched]>;
3976 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3977 (ins RC:$src1, x86memop:$src2),
3978 !if(Is2Addr,
3979 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3980 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3981 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3982 (bitconvert (memop_frag addr:$src2)))))]>,
3983 Sched<[itins.Sched.Folded, ReadAfterLd]>;
3984 }
3985 } // ExeDomain = SSEPackedInt
3987 defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3988 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
3989 defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3990 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
3991 defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3992 SSE_INTALU_ITINS_P, 1, NoVLX>;
3993 defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3994 SSE_INTALUQ_ITINS_P, 1, NoVLX>;
3995 defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
3996 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
3997 defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
3998 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
3999 defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
4000 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4001 defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
4002 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4003 defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
4004 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
4005 defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
4006 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
4007 defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
4008 SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
4009 defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
4010 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4011 defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
4012 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4013 defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
4014 SSE_INTALU_ITINS_P, 0, NoVLX>;
4015 defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
4016 SSE_INTALUQ_ITINS_P, 0, NoVLX>;
4017 defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
4018 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4019 defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
4020 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4021 defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
4022 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4023 defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
4024 SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
4025 defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
4026 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4027 defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
4028 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4029 defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
4030 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4031 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
4032 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4033 defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
4034 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4035 defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
4036 SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
4038 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4039 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
4040 loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
4042 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4043 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
4044 VR256, loadv4i64, i256mem, SSE_PMADD,
4045 0>, VEX_4V, VEX_L;
4046 let Constraints = "$src1 = $dst" in
4047 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
4048 memopv2i64, i128mem, SSE_PMADD>;
4050 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4051 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
4052 loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
4053 VEX_4V;
4054 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4055 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
4056 loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
4057 VEX_4V, VEX_L;
4058 let Constraints = "$src1 = $dst" in
4059 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
4060 memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
4062 let Predicates = [HasAVX, NoVLX] in
4063 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
4064 loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
4065 VEX_4V;
4066 let Predicates = [HasAVX2, NoVLX] in
4067 defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
4068 VR256, loadv4i64, i256mem,
4069 SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
4070 let Constraints = "$src1 = $dst" in
4071 defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
4072 memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
4074 //===---------------------------------------------------------------------===//
4075 // SSE2 - Packed Integer Logical Instructions
4076 //===---------------------------------------------------------------------===//
4078 multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
4079 string OpcodeStr, SDNode OpNode,
4080 SDNode OpNode2, RegisterClass RC,
4081 ValueType DstVT, ValueType SrcVT,
4082 PatFrag ld_frag, bit Is2Addr = 1> {
4083 // src2 is always 128-bit
4084 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
4085 (ins RC:$src1, VR128:$src2),
4086 !if(Is2Addr,
4087 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4088 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4089 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
4090 SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
4091 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
4092 (ins RC:$src1, i128mem:$src2),
4093 !if(Is2Addr,
4094 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4095 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4096 [(set RC:$dst, (DstVT (OpNode RC:$src1,
4097 (SrcVT (bitconvert (ld_frag addr:$src2))))))],
4098 SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
4099 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
4100 (ins RC:$src1, u8imm:$src2),
4101 !if(Is2Addr,
4102 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4103 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4104 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
4105 SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
4106 }
4108 multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
4109 string OpcodeStr, SDNode OpNode,
4110 SDNode OpNode2, ValueType DstVT128,
4111 ValueType DstVT256, ValueType SrcVT,
4112 Predicate prd> {
4113 let Predicates = [HasAVX, prd] in
4114 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
4115 OpNode, OpNode2, VR128, DstVT128, SrcVT,
4116 loadv2i64, 0>, VEX_4V;
4117 let Predicates = [HasAVX2, prd] in
4118 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
4119 OpNode, OpNode2, VR256, DstVT256, SrcVT,
4120 loadv2i64, 0>, VEX_4V, VEX_L;
4121 let Constraints = "$src1 = $dst" in
4122 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
4123 VR128, DstVT128, SrcVT, memopv2i64>;
4124 }
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
       IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
}

multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                             VR128, v16i8, 0>, VEX_4V;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                               VR256, v32i8, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
}
let ExeDomain = SSEPackedInt in {
defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                               v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                               v4i32, v8i32, v4i32, NoVLX>;
defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                               v2i64, v4i64, v2i64, NoVLX>;

defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                               v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                               v4i32, v8i32, v4i32, NoVLX>;
defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                               v2i64, v4i64, v2i64, NoVLX>;

defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                               v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                               v4i32, v8i32, v4i32, NoVLX>;

defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
// PSRADQri doesn't exist in SSE[1-3].
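// PSLLDQ/PSRLDQ shift the whole 128-bit register by a byte count rather
// than shifting each element; e.g. "pslldq $4, %xmm0" moves every byte of
// %xmm0 up four positions and zero-fills the low four bytes.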
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
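// The trailing 1/0 is the IsCommutable bit of PDI_binop_all: equality
// compares commute (pcmpeq a, b == pcmpeq b, a), while the signed
// greater-than compares do not.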
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                      Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                       Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
               IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                  (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt

defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             NoVLX_Or_NoBWI>, XD;

let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}
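// The PSHUFD immediate encodes one source-lane index per result dword, two
// bits each; e.g. "pshufd $0x1B, %xmm0, %xmm1" (0x1B = 0b00011011) reverses
// the four dwords of %xmm0.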
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//
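// These instructions narrow: each result element is a source element
// saturated to the smaller element type, e.g. packsswb packs 2 x 8 signed
// words into 16 signed-saturated bytes.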
let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
                     bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
               Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set VR128:$dst,
                     (OutVT (OpNode (ArgVT VR128:$src1),
                                    (bitconvert (ld_frag addr:$src2)))))]>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;
}

multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode> {
  def Yrr : PDI<opc, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR256:$dst,
                      (OutVT (OpNode (ArgVT VR256:$src1),
                                     (bitconvert (loadv4i64 addr:$src2)))))]>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
}

multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
                     bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
                 Sched<[WriteShuffle]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (OutVT (OpNode (ArgVT VR128:$src1),
                                      (bitconvert (ld_frag addr:$src2)))))]>,
                 Sched<[WriteShuffleLd, ReadAfterLd]>;
}

multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
                       ValueType ArgVT, SDNode OpNode> {
  def Yrr : SS48I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
                  Sched<[WriteShuffle]>;
  def Yrm : SS48I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (OutVT (OpNode (ArgVT VR256:$src1),
                                       (bitconvert (loadv4i64 addr:$src2)))))]>,
                  Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             loadv2i64, 0>, VEX_4V;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             loadv2i64, 0>, VEX_4V;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             loadv2i64, 0>, VEX_4V;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             loadv2i64, 0>, VEX_4V;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
                               VEX_4V, VEX_L;
  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
                               VEX_4V, VEX_L;

  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
                               VEX_4V, VEX_L;
  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
                               VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            memopv2i64>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            memopv2i64>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            memopv2i64>;
  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            memopv2i64>;
}
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
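// Unpack instructions interleave elements from the low or high halves of
// the two sources, e.g. punpcklbw yields {a0, b0, a1, b1, ..., a7, b7}.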
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1,
                             (bitconvert (ld_frag addr:$src2)))))],
      IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}

multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1,
                             (bitconvert (loadv4i64 addr:$src2)))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                loadv2i64, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                loadv2i64, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 loadv2i64, 0>, VEX_4V;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
                                  VEX_4V, VEX_L;
  defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
                                  VEX_4V, VEX_L;
  defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
                                  VEX_4V, VEX_L;
  defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
                                  VEX_4V, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
                                  VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
                                   VEX_4V, VEX_L;
  defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
                                  VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
                                   VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                               memopv2i64>;
  defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                               memopv2i64>;
  defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                               memopv2i64>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                memopv2i64>;

  defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                               memopv2i64>;
  defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                               memopv2i64>;
  defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                               memopv2i64>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                memopv2i64>;
}
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  def rmi : Ii8<0xC4, MRMSrcMem,
       (outs VR128:$dst), (ins VR128:$src1,
        i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, PD, VEX,
                    Sched<[WriteShuffle]>;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                             imm:$src2))], IIC_SSE_PEXTRW>,
                     Sched<[WriteShuffleLd, ReadAfterLd]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
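// Example: "pinsrw $3, %eax, %xmm0" replaces word element 3 of %xmm0 with
// the low 16 bits of %eax; "pextrw $3, %xmm0, %eax" zero-extends word
// element 3 of %xmm0 into %eax.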
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {

def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src),
                       "pmovmskb\t{$src, $dst|$dst, $src}",
                       [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
                       IIC_SSE_MOVMSK>, VEX;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
                        VEX, VEX_L;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
                     IIC_SSE_MOVMSK>;
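// Example: "pmovmskb %xmm0, %eax" gathers the sign bit of each of the 16
// bytes of %xmm0 into the low 16 bits of %eax (32 bits for the YMM form).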
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
                       (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
                       IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
                         (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
                         IIC_SSE_MASKMOV>, VEX;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
                     IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
                       IIC_SSE_MASKMOV>;
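// MASKMOVDQU stores only the bytes of $src whose corresponding $mask byte
// has its high bit set, to the address held in the implicit EDI/RDI
// register; hence the Uses lists above instead of an explicit address
// operand.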
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))],
                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                      Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                       IIC_SSE_MOVDQ>,
                       VEX, Sched<[WriteLoad]>;
def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (bitconvert GR32:$src))],
                     IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                     IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                          (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                        Sched<[WriteMove]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                      Sched<[WriteMove]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (extractelt (v4i32 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt

def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
          (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
          (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                            (iPTR 0)))],
                          IIC_SSE_MOVD_ToGP>,
                          VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                          (iPTR 0)))],
                        IIC_SSE_MOVD_ToGP>;
} // SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                           VEX, Sched<[WriteLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))],
                       IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (bitconvert FR32:$src))],
                     IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                     "movd\t{$src, $dst|$dst, $src}",
                     [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (VMOV64toPQIrr GR64:$src)>;

    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
              (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
  }
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzload addr:$src)),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
    def : Pat<(v8i32 (X86vzload addr:$src)),
              (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
              (MOV64toPQIrr GR64:$src)>;
  }
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzload addr:$src)),
              (MOVDI2PDIrm addr:$src)>;
  }
}
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                     VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                    IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                      IIC_SSE_MOVDQ>;
} // ExeDomain, SchedRW
// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
// Aliases to help the assembler pick two-byte VEX encodings by swapping the
// operands relative to the normal instructions to use VEX.R instead of VEX.B.
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
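// A two-byte VEX prefix can carry only VEX.R, so an encoding whose sole
// high register (xmm8-xmm15) lands in the ModRM reg field is one byte
// shorter. For "vmovq %xmm8, %xmm1" the alias selects the MRMDestReg form
// above, putting %xmm8 in the reg field (VEX.R) rather than r/m (VEX.B).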
let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note: there is a
// bug in the IA32 documentation; movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                    XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                    XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let AddedComplexity = 20 in {
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (vt (OpNode RC:$src)))],
              IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
              IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (v2f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))],
              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}

// FIXME: Merge with the class above once there are patterns for the ymm version.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
              Sched<[WriteFShuffle]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
              Sched<[WriteLoad]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64
                         (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [HasAVX, NoVLX] in
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
          (VMOVDDUPrm addr:$src)>;
let Predicates = [HasAVX1Only] in
def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
          (VMOVDDUPrm addr:$src)>;

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                         (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
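// MOVDDUP duplicates the low double: with %xmm1 = {a, b}, "movddup %xmm1,
// %xmm0" yields %xmm0 = {a, a}. The memory form reads only 64 bits, which
// is why it also serves as a v2f64 broadcast in the patterns above.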
//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                                  f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                                  f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
}

// Patterns used to select 'addsub' instructions.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
}
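// ADDSUB subtracts in the even lanes and adds in the odd ones:
//   addsubps: dst = {a0-b0, a1+b1, a2-b2, a3+b3}
//   addsubpd: dst = {a0-b0, a1+b1}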
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
       Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
       IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
       Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
       IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, loadv4f32, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, loadv4f32, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, loadv2f64, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, loadv2f64, 0>, VEX_4V;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         memopv2f64>;
  }
}
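// Horizontal ops combine adjacent pairs within each source, e.g.
//   haddps: dst = {a0+a1, a2+a3, b0+b1, b2+b3}
// The 256-bit forms do this independently within each 128-bit lane.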
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))],
                 IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
                 IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[WriteVecALU]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
                  Sched<[WriteVecALULd]>;
}

// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(xor
             (bc_v2i64 (v16i1sextv16i8)),
             (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr VR128:$src)>;
  def : Pat<(xor
             (bc_v2i64 (v8i1sextv8i16)),
             (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr VR128:$src)>;
}
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(xor
             (bc_v2i64 (v4i1sextv4i32)),
             (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr VR128:$src)>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(xor
             (bc_v4i64 (v32i1sextv32i8)),
             (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBYrr VR256:$src)>;
  def : Pat<(xor
             (bc_v4i64 (v16i1sextv16i16)),
             (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWYrr VR256:$src)>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(xor
             (bc_v4i64 (v8i1sextv8i32)),
             (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDYrr VR256:$src)>;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;

let Predicates = [UseSSSE3] in {
  def : Pat<(xor
             (bc_v2i64 (v16i1sextv16i8)),
             (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr VR128:$src)>;
  def : Pat<(xor
             (bc_v2i64 (v8i1sextv8i16)),
             (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr VR128:$src)>;
  def : Pat<(xor
             (bc_v2i64 (v4i1sextv4i32)),
             (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr VR128:$src)>;
}
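// The (xor (add x, mask), mask) patterns above are the standard absolute-
// value idiom with mask = x >>s (bits-1) (or pcmpgt 0, x for bytes); they
// let a single pabs* instruction replace the three-instruction expansion.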
5434 //===---------------------------------------------------------------------===//
5435 // SSSE3 - Packed Binary Operator Instructions
5436 //===---------------------------------------------------------------------===//
5438 let Sched = WriteVecALU in {
5439 def SSE_PHADDSUBD : OpndItins<
5440 IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
5442 def SSE_PHADDSUBSW : OpndItins<
5443 IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
5445 def SSE_PHADDSUBW : OpndItins<
5446 IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
5449 let Sched = WriteShuffle in
5450 def SSE_PSHUFB : OpndItins<
5451 IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
5453 let Sched = WriteVecALU in
5454 def SSE_PSIGN : OpndItins<
5455 IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
5457 let Sched = WriteVecIMul in
5458 def SSE_PMULHRSW : OpndItins<
5459 IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
5462 /// SS3I_binop_rm - Simple SSSE3 bin op
5463 multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5464 ValueType DstVT, ValueType OpVT, RegisterClass RC,
5465 PatFrag memop_frag, X86MemOperand x86memop,
5466 OpndItins itins, bit Is2Addr = 1> {
5467 let isCommutable = 1 in
5468 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
5469 (ins RC:$src1, RC:$src2),
5471 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5472 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5473 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
5474 Sched<[itins.Sched]>;
5475 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
5476 (ins RC:$src1, x86memop:$src2),
5478 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5479 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5481 (DstVT (OpNode (OpVT RC:$src1),
5482 (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
5483 Sched<[itins.Sched.Folded, ReadAfterLd]>;
5486 /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
5487 multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
5488 Intrinsic IntId128, OpndItins itins,
5489 PatFrag ld_frag, bit Is2Addr = 1> {
5490 let isCommutable = 1 in
5491 def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
5492 (ins VR128:$src1, VR128:$src2),
5494 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5495 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5496 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
5497 Sched<[itins.Sched]>;
5498 def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
5499 (ins VR128:$src1, i128mem:$src2),
5501 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5502 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5504 (IntId128 VR128:$src1,
5505 (bitconvert (ld_frag addr:$src2))))]>,
5506 Sched<[itins.Sched.Folded, ReadAfterLd]>;
5509 multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
5511 X86FoldableSchedWrite Sched> {
5512 let isCommutable = 1 in
5513 def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
5514 (ins VR256:$src1, VR256:$src2),
5515 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5516 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
5518 def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
5519 (ins VR256:$src1, i256mem:$src2),
5520 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5522 (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
5523 Sched<[Sched.Folded, ReadAfterLd]>;
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                  VR128, loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, loadv2i64, i128mem,
                                  SSE_PMADD, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                  VR128, loadv2i64, i128mem,
                                  SSE_PMULHRSW, 0>, VEX_4V;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW  : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                loadv2i64, i128mem,
                                SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD  : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                loadv2i64, i128mem,
                                SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW  : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                loadv2i64, i128mem,
                                SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD  : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                loadv2i64, i128mem,
                                SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB  : SS3I_binop_rm_int<0x08, "vpsignb",
                                    int_x86_ssse3_psign_b_128,
                                    SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPSIGNW  : SS3I_binop_rm_int<0x09, "vpsignw",
                                    int_x86_ssse3_psign_w_128,
                                    SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPSIGND  : SS3I_binop_rm_int<0x0A, "vpsignd",
                                    int_x86_ssse3_psign_d_128,
                                    SSE_PSIGN, loadv2i64, 0>, VEX_4V;
  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                    int_x86_ssse3_phadd_sw_128,
                                    SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                    int_x86_ssse3_phsub_sw_128,
                                    SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
}
}
let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                   VR256, loadv4i64, i256mem,
                                   SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, loadv4i64, i256mem,
                                   SSE_PMADD, 0>, VEX_4V, VEX_L;
}
defm VPMULHRSWY    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                   VR256, loadv4i64, i256mem,
                                   SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY  : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                 VR256, loadv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY  : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                 loadv4i64, i256mem,
                                 SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY  : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                 VR256, loadv4i64, i256mem,
                                 SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY  : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                 loadv4i64, i256mem,
                                 SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY  : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                       WriteVecALU>, VEX_4V, VEX_L;
  defm VPSIGNWY  : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                       WriteVecALU>, VEX_4V, VEX_L;
  defm VPSIGNDY  : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                       WriteVecALU>, VEX_4V, VEX_L;
  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                       int_x86_avx2_phadd_sw,
                                       WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                       int_x86_avx2_phsub_sw,
                                       WriteVecALU>, VEX_4V, VEX_L;
}
}
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SSE_PSIGN, memopv2i64>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW, memopv2i64>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memopv2i64, i128mem,
                                 SSE_PMADD>;
}
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
}
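// As a worked example of the horizontal-add semantics matched above,
//   phaddw %xmm1, %xmm0
// computes (with x = xmm0, y = xmm1, elements numbered from 0):
//   xmm0 = [ x0+x1, x2+x3, x4+x5, x6+x7, y0+y1, y2+y3, y4+y5, y6+y7 ]
// i.e. adjacent pairs of the destination fill the low half of the result
// and adjacent pairs of the source fill the high half.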
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}

let Predicates = [HasAVX] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr">;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
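// PALIGNR semantics, for reference: the two inputs are concatenated
// (destination in the high half) and shifted right by the immediate, in
// bytes, keeping the low 16 bytes. E.g. for the 128-bit form,
//   palignr $5, %xmm1, %xmm0
// yields bytes 5..15 of xmm1 followed by bytes 0..4 of xmm0. The 256-bit
// AVX2 form applies the same operation to each 128-bit lane independently.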
//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                   TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                  TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
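// For reference, the implicit register convention encoded in the Uses
// lists above follows the ISA: MONITOR takes the linear address to arm in
// EAX/RAX, extensions in ECX, and hints in EDX, while MWAIT takes its
// extensions in ECX and hints in EAX. The MONITOR pseudo above is
// custom-inserted so its explicit operands end up copied into those fixed
// registers before the real instruction is emitted.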
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            OpndItins itins> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [],
                 itins.rm>, Sched<[itins.Sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              OpndItins SSEItins, OpndItins AVXItins,
                              OpndItins AVX2Itins, Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, AVXItins>, VEX;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp,
                                        SSE_INTALU_ITINS_SHUFF_P,
                                        DEFAULT_ITINS_SHUFFLESCHED,
                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
}
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
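// To make the NAME concatenation above concrete: the single line
//   defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
// fans out to roughly six instructions per sign/zero flavor, e.g. for the
// sign-extending flavor: PMOVSXBWrr/rm (SSE4.1, xmm), VPMOVSXBWrr/rm (AVX,
// xmm), and VPMOVSXBWYrr/Yrm (AVX2, ymm destination), with the pmovzx*
// forms produced the same way at opcode 0x20 + 0x10 = 0x30.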
// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
  // Register-Register patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp, PatFrag ExtLoad16> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
                                                 imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
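// Usage, for reference:
//   pextrb $3, %xmm0, %eax    # eax = zero_extend(byte 3 of xmm0)
//   pextrb $3, %xmm0, (%rdi)  # store byte 3 of xmm0 to memory
// The register form always zero-extends into the 32-bit destination, which
// is why the store pattern above can wrap the extract in assertzext and
// truncate back down to i8 without losing anything.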
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, Sched<[WriteShuffle]>;

  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
                                                  imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                   (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                   (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>, REX_W;
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, REX_W;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                 itins.rr>, Sched<[WriteFBlend]>;
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD  : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ  : SS41I_insert64<0x22, "pinsrq">, REX_W;
// insertps has a few different modes. The first two forms below are
// optimized inserts that never zero arbitrary elements in the destination
// vector; the form matching the intrinsic may additionally zero arbitrary
// elements in the target vector via the immediate's zero mask.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      Sched<[WriteFShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     imm:$src3))], itins.rm>,
      Sched<[WriteFShuffleLd, ReadAfterLd]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
let Predicates = [UseSSE41] in {
  // If we're inserting an element from a load or a null pshuf of a load,
  // fold the load into the insertps instruction.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
                    (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
                    imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
                    (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}

let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                    (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                    (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
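// INSERTPS immediate layout, for reference: bits [7:6] (count_s) select
// the source element, bits [5:4] (count_d) select the destination slot,
// and bits [3:0] are a zero mask applied afterwards. E.g.
//   insertps $0x1D, %xmm1, %xmm0
// copies element 0 of xmm1 into element 1 of xmm0 and zeroes elements 0,
// 2 and 3; with a memory operand, count_s is ignored and treated as 0,
// which is what makes the load-folding patterns above legal.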
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           PatFrag mem_frag32, PatFrag mem_frag64,
                           Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                    IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                    IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr> {
let ExeDomain = GenericDomain, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[WriteFAdd]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[WriteFAddLd, ReadAfterLd]>;

  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[WriteFAdd]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr> {
let ExeDomain = GenericDomain, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[WriteFAdd]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[WriteFAddLd, ReadAfterLd]>;

  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
            "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[WriteFAdd]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
            "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain, hasSideEffects = 0
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;

  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
                                 loadv4f32, loadv2f64,
                                 int_x86_sse41_round_ps,
                                 int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
                                 loadv8f32, loadv4f64,
                                 int_x86_avx_round_ps_256,
                                 int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}

let Predicates = [UseAVX] in {
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xB))>;

  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x9))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xA))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xB))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x9))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xA))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xB))>;
}
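// ROUND* immediate, for reference: bits [1:0] select the rounding mode
// (00 = nearest-even, 01 = down, 10 = up, 11 = truncate), bit 2 selects
// the MXCSR.RC field instead of bits [1:0], and bit 3 suppresses the
// precision (inexact) exception. Hence the constants used in the
// patterns here: 0x9 = floor, 0xA = ceil, 0xB = trunc, 0xC = nearbyint
// (MXCSR mode, inexact suppressed), 0x4 = rint (MXCSR mode, inexact
// reported).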
defm ROUND  : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
                              memopv4f32, memopv2f64, int_x86_sse41_round_ps,
                              int_x86_sse41_round_pd>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round">;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;

let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr FR32:$src, (i32 0x9))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0x9))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xA))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xA))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr FR32:$src, (i32 0xB))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr FR64:$src, (i32 0xB))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x9))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xA))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xB))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x9))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xA))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xB))>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// The PTEST instruction. X86ISelLowering lowers to the X86ptest node
// matched here, primarily from the Intel intrinsic that corresponds to
// this instruction.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
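// PTEST flag semantics, for reference (op1 = first operand, op2 = second):
//   ZF = ((op1 & op2) == 0)
//   CF = ((~op1 & op2) == 0)
// so "ptest %xmm1, %xmm0; je none_set" branches when xmm0 & xmm1 == 0,
// which is the EFLAGS result the X86ptest node above encodes.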
// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128 (bitconvert (ld_frag addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                          int_x86_sse41_phminposuw, loadv2i64,
                                          WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw, memopv2i64,
                                         WriteVecIMul>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                           PatFrag memop_frag, X86MemOperand x86memop,
                           OpndItins itins,
                           bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD  : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMINUD  : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMAXSD  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMAXUD  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMULDQ  : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                  VR128, loadv2i64, i128mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB  : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMINUW  : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMAXSB  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPMAXUW  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                VR128, memopv2i64, i128mem,
                                SSE_INTMUL_ITINS_P, 1>;
}
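// A note on PMULDQ, which is why SS48I_binop_rm2 exists: it reads v4i32
// sources but produces a v2i64 result, multiplying only the even-numbered
// (0 and 2) signed doubleword elements of each source into two full 64-bit
// products, e.g.
//   dst[0] = sext(src1[0]) * sext(src2[0])
//   dst[1] = sext(src1[2]) * sext(src2[2])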
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                               X86MemOperand x86memop, bit Is2Addr = 1,
                               OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6916 /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
6917 multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6918 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6919 X86MemOperand x86memop, bit Is2Addr = 1,
6920 OpndItins itins = DEFAULT_ITINS> {
6921 let isCommutable = 1 in
6922 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6923 (ins RC:$src1, RC:$src2, u8imm:$src3),
6925 !strconcat(OpcodeStr,
6926 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6927 !strconcat(OpcodeStr,
6928 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6929 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
6930 itins.rr>, Sched<[itins.Sched]>;
6931 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6932 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6933 !if(Is2Addr,
6934 !strconcat(OpcodeStr,
6935 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6936 !strconcat(OpcodeStr,
6937 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6938 [(set RC:$dst,
6939 (OpVT (OpNode RC:$src1,
6940 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
6941 Sched<[itins.Sched.Folded, ReadAfterLd]>;
6942 }
6944 let Predicates = [HasAVX] in {
6945 let isCommutable = 0 in {
6946 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6947 VR128, loadv2i64, i128mem, 0,
6948 DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
6949 }
6951 let ExeDomain = SSEPackedSingle in {
6952 defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6953 VR128, loadv4f32, f128mem, 0,
6954 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
6955 defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6956 VR256, loadv8f32, f256mem, 0,
6957 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
6958 }
6959 let ExeDomain = SSEPackedDouble in {
6960 defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6961 VR128, loadv2f64, f128mem, 0,
6962 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
6963 defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6964 VR256, loadv4f64, f256mem, 0,
6965 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
6966 }
6967 defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6968 VR128, loadv2i64, i128mem, 0,
6969 DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
6971 let ExeDomain = SSEPackedSingle in
6972 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6973 VR128, loadv4f32, f128mem, 0,
6974 SSE_DPPS_ITINS>, VEX_4V;
6975 let ExeDomain = SSEPackedDouble in
6976 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6977 VR128, loadv2f64, f128mem, 0,
6978 SSE_DPPD_ITINS>, VEX_4V;
6979 let ExeDomain = SSEPackedSingle in
6980 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6981 VR256, loadv8f32, i256mem, 0,
6982 SSE_DPPS_ITINS>, VEX_4V, VEX_L;
6983 }
6985 let Predicates = [HasAVX2] in {
6986 let isCommutable = 0 in {
6987 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6988 VR256, loadv4i64, i256mem, 0,
6989 DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
6991 defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6992 VR256, loadv4i64, i256mem, 0,
6993 DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
6994 }
6996 let Constraints = "$src1 = $dst" in {
6997 let isCommutable = 0 in {
6998 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6999 VR128, memopv2i64, i128mem,
7000 1, SSE_MPSADBW_ITINS>;
7001 }
7002 let ExeDomain = SSEPackedSingle in
7003 defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
7004 VR128, memopv4f32, f128mem,
7005 1, SSE_INTALU_ITINS_FBLEND_P>;
7006 let ExeDomain = SSEPackedDouble in
7007 defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
7008 VR128, memopv2f64, f128mem,
7009 1, SSE_INTALU_ITINS_FBLEND_P>;
7010 defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
7011 VR128, memopv2i64, i128mem,
7012 1, SSE_INTALU_ITINS_BLEND_P>;
7013 let ExeDomain = SSEPackedSingle in
7014 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
7015 VR128, memopv4f32, f128mem, 1,
7016 SSE_DPPS_ITINS>;
7017 let ExeDomain = SSEPackedDouble in
7018 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
7019 VR128, memopv2f64, f128mem, 1,
7020 SSE_DPPD_ITINS>;
7021 }
7023 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
7024 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
7025 RegisterClass RC, X86MemOperand x86memop,
7026 PatFrag mem_frag, Intrinsic IntId,
7027 X86FoldableSchedWrite Sched> {
7028 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
7029 (ins RC:$src1, RC:$src2, RC:$src3),
7030 !strconcat(OpcodeStr,
7031 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7032 [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
7033 NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
7034 Sched<[Sched]>;
7036 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
7037 (ins RC:$src1, x86memop:$src2, RC:$src3),
7038 !strconcat(OpcodeStr,
7039 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7040 [(set RC:$dst,
7041 (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
7042 RC:$src3))],
7043 NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
7044 Sched<[Sched.Folded, ReadAfterLd]>;
7045 }
7047 let Predicates = [HasAVX] in {
7048 let ExeDomain = SSEPackedDouble in {
7049 defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
7050 loadv2f64, int_x86_sse41_blendvpd,
7051 WriteFVarBlend>;
7052 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
7053 loadv4f64, int_x86_avx_blendv_pd_256,
7054 WriteFVarBlend>, VEX_L;
7055 } // ExeDomain = SSEPackedDouble
7056 let ExeDomain = SSEPackedSingle in {
7057 defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
7058 loadv4f32, int_x86_sse41_blendvps,
7059 WriteFVarBlend>;
7060 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
7061 loadv8f32, int_x86_avx_blendv_ps_256,
7062 WriteFVarBlend>, VEX_L;
7063 } // ExeDomain = SSEPackedSingle
7064 defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
7065 loadv2i64, int_x86_sse41_pblendvb,
7066 WriteVarBlend>;
7067 }
7069 let Predicates = [HasAVX2] in {
7070 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
7071 loadv4i64, int_x86_avx2_pblendvb,
7072 WriteVarBlend>, VEX_L;
7073 }
7075 let Predicates = [HasAVX] in {
7076 def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
7077 (v16i8 VR128:$src2))),
7078 (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
7079 def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
7080 (v4i32 VR128:$src2))),
7081 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
7082 def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
7083 (v4f32 VR128:$src2))),
7084 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
7085 def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
7086 (v2i64 VR128:$src2))),
7087 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
7088 def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
7089 (v2f64 VR128:$src2))),
7090 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
7091 def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
7092 (v8i32 VR256:$src2))),
7093 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7094 def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
7095 (v8f32 VR256:$src2))),
7096 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7097 def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
7098 (v4i64 VR256:$src2))),
7099 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7100 def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
7101 (v4f64 VR256:$src2))),
7102 (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7103 }
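// Illustrative example (not from the original file): BLENDV selects from the
// second source where a mask bit is set, while ISD::VSELECT selects the
// first, so the patterns above swap $src1 and $src2. An IR select such as
//   %r = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
// (with %m sign-extended to v4i32) is emitted as
//   vblendvps %xmm_m, %xmm_a, %xmm_b, %xmm_r
// where the per-element sign bits of %xmm_m (placeholder names) choose
// %xmm_a over %xmm_b.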
7105 let Predicates = [HasAVX2] in {
7106 def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
7107 (v32i8 VR256:$src2))),
7108 (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7109 }
7112 // FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
7113 // on targets where they have equal performance. These were changed to use
7114 // blends because blends have better throughput on SandyBridge and Haswell, but
7115 // movs[s/d] are 1-2 byte shorter instructions.
7116 let Predicates = [UseAVX] in {
7117 let AddedComplexity = 15 in {
7118 // Move a scalar to XMM zero-extended: zero a VR128, then do a
7119 // MOVS{S,D} to the lower bits.
7120 def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
7121 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
7122 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
7123 (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
7124 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
7125 (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
7126 def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
7127 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
7129 // Move low f32 and clear high bits.
7130 def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
7131 (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
7133 // Move low f64 and clear high bits.
7134 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
7135 (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
7136 }
7138 // These will incur an FP/int domain crossing penalty, but it may be the only
7139 // way without AVX2. Do not add any complexity because we may be able to match
7140 // more optimal patterns defined earlier in this file.
7141 def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
7142 (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
7143 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
7144 (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
7145 }
7147 // FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
7148 // on targets where they have equal performance. These were changed to use
7149 // blends because blends have better throughput on SandyBridge and Haswell, but
7150 // movs[s/d] are 1-2 byte shorter instructions.
7151 let Predicates = [UseSSE41], AddedComplexity = 15 in {
7152 // With SSE41 we can use blends for these patterns.
7153 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
7154 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
7155 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
7156 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
7157 }
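// Illustrative note (not from the original file): the blend immediate is a
// per-element select mask for the second operand, so blending (V_SET0) with
// $src under imm 1 keeps only element 0 of $src (blendps), and imm 3 (word
// mask 0b0011) keeps only the low dword ($src words 0-1) for pblendw; every
// other element comes from the zero vector, which is exactly the X86vzmovl
// "move low element, zero the rest" semantics.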
7160 /// SS41I_ternary_int - SSE 4.1 ternary operator
7161 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
7162 multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7163 X86MemOperand x86memop, Intrinsic IntId,
7164 OpndItins itins = DEFAULT_ITINS> {
7165 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
7166 (ins VR128:$src1, VR128:$src2),
7167 !strconcat(OpcodeStr,
7168 "\t{$src2, $dst|$dst, $src2}"),
7169 [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
7170 itins.rr>, Sched<[itins.Sched]>;
7172 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
7173 (ins VR128:$src1, x86memop:$src2),
7174 !strconcat(OpcodeStr,
7175 "\t{$src2, $dst|$dst, $src2}"),
7176 [(set VR128:$dst,
7177 (IntId VR128:$src1,
7178 (bitconvert (mem_frag addr:$src2)), XMM0))],
7179 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
7180 }
7181 }
7183 let ExeDomain = SSEPackedDouble in
7184 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
7185 int_x86_sse41_blendvpd,
7186 DEFAULT_ITINS_FBLENDSCHED>;
7187 let ExeDomain = SSEPackedSingle in
7188 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
7189 int_x86_sse41_blendvps,
7190 DEFAULT_ITINS_FBLENDSCHED>;
7191 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
7192 int_x86_sse41_pblendvb,
7193 DEFAULT_ITINS_VARBLENDSCHED>;
7195 // Aliases with the implicit xmm0 argument
7196 def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7197 (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
7198 def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7199 (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
7200 def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7201 (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
7202 def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7203 (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
7204 def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7205 (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
7206 def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7207 (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
7209 let Predicates = [UseSSE41] in {
7210 def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
7211 (v16i8 VR128:$src2))),
7212 (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
7213 def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
7214 (v4i32 VR128:$src2))),
7215 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
7216 def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
7217 (v4f32 VR128:$src2))),
7218 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
7219 def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
7220 (v2i64 VR128:$src2))),
7221 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
7222 def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
7223 (v2f64 VR128:$src2))),
7224 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
7225 }
7227 let AddedComplexity = 400 in { // Prefer non-temporal versions
7228 let SchedRW = [WriteLoad] in {
7229 let Predicates = [HasAVX, NoVLX] in
7230 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
7231 "vmovntdqa\t{$src, $dst|$dst, $src}",
7232 [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
7233 VEX;
7234 let Predicates = [HasAVX2, NoVLX] in
7235 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
7236 "vmovntdqa\t{$src, $dst|$dst, $src}",
7237 [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
7238 VEX, VEX_L;
7239 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
7240 "movntdqa\t{$src, $dst|$dst, $src}",
7241 [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
7242 } // SchedRW
7244 let Predicates = [HasAVX2, NoVLX] in {
7245 def : Pat<(v8f32 (alignednontemporalload addr:$src)),
7246 (VMOVNTDQAYrm addr:$src)>;
7247 def : Pat<(v4f64 (alignednontemporalload addr:$src)),
7248 (VMOVNTDQAYrm addr:$src)>;
7249 def : Pat<(v4i64 (alignednontemporalload addr:$src)),
7250 (VMOVNTDQAYrm addr:$src)>;
7251 }
7253 let Predicates = [HasAVX, NoVLX] in {
7254 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
7255 (VMOVNTDQArm addr:$src)>;
7256 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
7257 (VMOVNTDQArm addr:$src)>;
7258 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
7259 (VMOVNTDQArm addr:$src)>;
7260 }
7262 let Predicates = [UseSSE41] in {
7263 def : Pat<(v4f32 (alignednontemporalload addr:$src)),
7264 (MOVNTDQArm addr:$src)>;
7265 def : Pat<(v2f64 (alignednontemporalload addr:$src)),
7266 (MOVNTDQArm addr:$src)>;
7267 def : Pat<(v2i64 (alignednontemporalload addr:$src)),
7268 (MOVNTDQArm addr:$src)>;
7269 }
7271 } // AddedComplexity
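// Illustrative example (not from the original file): a 16-byte-aligned IR
// load tagged with !nontemporal metadata, e.g.
//   %v = load <2 x i64>, <2 x i64>* %p, align 16, !nontemporal !0
// matches alignednontemporalload above and selects (V)MOVNTDQA, which gives
// the hardware a non-temporal streaming hint instead of a plain movdqa.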
7273 //===----------------------------------------------------------------------===//
7274 // SSE4.2 - Compare Instructions
7275 //===----------------------------------------------------------------------===//
7277 /// SS42I_binop_rm - Simple SSE 4.2 binary operator
7278 multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7279 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
7280 X86MemOperand x86memop, bit Is2Addr = 1> {
7281 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
7282 (ins RC:$src1, RC:$src2),
7283 !if(Is2Addr,
7284 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7285 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7286 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
7287 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
7288 (ins RC:$src1, x86memop:$src2),
7289 !if(Is2Addr,
7290 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7291 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7292 [(set RC:$dst,
7293 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
7294 }
7296 let Predicates = [HasAVX] in
7297 defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
7298 loadv2i64, i128mem, 0>, VEX_4V;
7300 let Predicates = [HasAVX2] in
7301 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
7302 loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
7304 let Constraints = "$src1 = $dst" in
7305 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
7306 memopv2i64, i128mem>;
7308 //===----------------------------------------------------------------------===//
7309 // SSE4.2 - String/text Processing Instructions
7310 //===----------------------------------------------------------------------===//
7312 // Packed Compare Implicit Length Strings, Return Mask
7313 multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
7314 def REG : PseudoI<(outs VR128:$dst),
7315 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7316 [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
7317 imm:$src3))]>;
7318 def MEM : PseudoI<(outs VR128:$dst),
7319 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7320 [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
7321 (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
7322 }
7324 let Defs = [EFLAGS], usesCustomInserter = 1 in {
7325 defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
7326 Requires<[HasAVX]>;
7327 defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
7328 Requires<[UseSSE42]>;
7329 }
7331 multiclass pcmpistrm_SS42AI<string asm> {
7332 def rr : SS42AI<0x62, MRMSrcReg, (outs),
7333 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7334 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
7335 []>, Sched<[WritePCmpIStrM]>;
7337 def rm : SS42AI<0x62, MRMSrcMem, (outs),
7338 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7339 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
7340 []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
7341 }
7343 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
7344 let Predicates = [HasAVX] in
7345 defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
7346 defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
7347 }
7349 // Packed Compare Explicit Length Strings, Return Mask
7350 multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
7351 def REG : PseudoI<(outs VR128:$dst),
7352 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
7353 [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
7354 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
7355 def MEM : PseudoI<(outs VR128:$dst),
7356 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
7357 [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
7358 (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
7359 }
7361 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
7362 defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
7363 Requires<[HasAVX]>;
7364 defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
7365 Requires<[UseSSE42]>;
7366 }
7368 multiclass SS42AI_pcmpestrm<string asm> {
7369 def rr : SS42AI<0x60, MRMSrcReg, (outs),
7370 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
7371 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
7372 []>, Sched<[WritePCmpEStrM]>;
7374 def rm : SS42AI<0x60, MRMSrcMem, (outs),
7375 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
7376 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
7377 []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
7378 }
7380 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
7381 let Predicates = [HasAVX] in
7382 defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
7383 defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
7384 }
7386 // Packed Compare Implicit Length Strings, Return Index
7387 multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
7388 def REG : PseudoI<(outs GR32:$dst),
7389 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7390 [(set GR32:$dst, EFLAGS,
7391 (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
7392 def MEM : PseudoI<(outs GR32:$dst),
7393 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7394 [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
7395 (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
7396 }
7398 let Defs = [EFLAGS], usesCustomInserter = 1 in {
7399 defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
7400 Requires<[HasAVX]>;
7401 defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
7402 Requires<[UseSSE42]>;
7403 }
7405 multiclass SS42AI_pcmpistri<string asm> {
7406 def rr : SS42AI<0x63, MRMSrcReg, (outs),
7407 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7408 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
7409 []>, Sched<[WritePCmpIStrI]>;
7411 def rm : SS42AI<0x63, MRMSrcMem, (outs),
7412 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7413 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
7414 []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
7415 }
7417 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
7418 let Predicates = [HasAVX] in
7419 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
7420 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
7421 }
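// Illustrative note (not from the original file): the u8imm operand encodes
// source format, aggregation and polarity. For example, imm 0x0C (unsigned
// bytes, "equal ordered") is the classic substring-search mode:
//   pcmpistri $0x0c, %xmm1, %xmm0
// The match index is returned in ECX and the flags in EFLAGS, which is why
// the real instructions above define only ECX/EFLAGS while the pseudos carry
// the selection patterns.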
7423 // Packed Compare Explicit Length Strings, Return Index
7424 multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
7425 def REG : PseudoI<(outs GR32:$dst),
7426 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
7427 [(set GR32:$dst, EFLAGS,
7428 (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
7429 def MEM : PseudoI<(outs GR32:$dst),
7430 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
7431 [(set GR32:$dst, EFLAGS,
7432 (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
7433 imm:$src5))]>;
7434 }
7436 let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
7437 defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
7438 Requires<[HasAVX]>;
7439 defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
7440 Requires<[UseSSE42]>;
7441 }
7443 multiclass SS42AI_pcmpestri<string asm> {
7444 def rr : SS42AI<0x61, MRMSrcReg, (outs),
7445 (ins VR128:$src1, VR128:$src3, u8imm:$src5),
7446 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
7447 []>, Sched<[WritePCmpEStrI]>;
7449 def rm : SS42AI<0x61, MRMSrcMem, (outs),
7450 (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
7451 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
7452 []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
7453 }
7455 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
7456 let Predicates = [HasAVX] in
7457 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
7458 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
7459 }
7461 //===----------------------------------------------------------------------===//
7462 // SSE4.2 - CRC Instructions
7463 //===----------------------------------------------------------------------===//
7465 // No CRC instructions have AVX equivalents
7467 // crc intrinsic instruction
7468 // This set of instructions is only rm; the only difference is the size
7469 // of r and m.
7470 class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
7471 RegisterClass RCIn, SDPatternOperator Int> :
7472 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
7473 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
7474 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
7475 Sched<[WriteFAdd]>;
7477 class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
7478 X86MemOperand x86memop, SDPatternOperator Int> :
7479 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
7480 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
7481 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
7482 IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
7484 let Constraints = "$src1 = $dst" in {
7485 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
7486 int_x86_sse42_crc32_32_8>;
7487 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
7488 int_x86_sse42_crc32_32_8>;
7489 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
7490 int_x86_sse42_crc32_32_16>, OpSize16;
7491 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
7492 int_x86_sse42_crc32_32_16>, OpSize16;
7493 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
7494 int_x86_sse42_crc32_32_32>, OpSize32;
7495 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
7496 int_x86_sse42_crc32_32_32>, OpSize32;
7497 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
7498 int_x86_sse42_crc32_64_64>, REX_W;
7499 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
7500 int_x86_sse42_crc32_64_64>, REX_W;
7501 let hasSideEffects = 0 in {
7503 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
7504 null_frag>, REX_W;
7505 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
7506 null_frag>, REX_W;
7507 }
7508 }
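// Illustrative usage (not from the original file): CRC-32C accumulates
// through the tied $src1 operand, so a buffer can be folded a qword at a time
// and finished bytewise, e.g. (AT&T):
//   crc32q (%rsi), %rax
//   crc32b 8(%rsi), %eax
// All widths share the 0xF0/0xF1 opcodes and differ only in operand size, as
// the class comment above notes.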
7510 //===----------------------------------------------------------------------===//
7511 // SHA-NI Instructions
7512 //===----------------------------------------------------------------------===//
7514 multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
7515 bit UsesXMM0 = 0> {
7516 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
7517 (ins VR128:$src1, VR128:$src2),
7518 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7519 [!if(UsesXMM0,
7520 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
7521 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
7523 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
7524 (ins VR128:$src1, i128mem:$src2),
7525 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7526 [!if(UsesXMM0,
7527 (set VR128:$dst, (IntId VR128:$src1,
7528 (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
7529 (set VR128:$dst, (IntId VR128:$src1,
7530 (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
7531 }
7533 let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
7534 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
7535 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7536 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7537 [(set VR128:$dst,
7538 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
7539 (i8 imm:$src3)))]>, TA;
7540 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
7541 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7542 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7543 [(set VR128:$dst,
7544 (int_x86_sha1rnds4 VR128:$src1,
7545 (bc_v4i32 (memopv2i64 addr:$src2)),
7546 (i8 imm:$src3)))]>, TA;
7548 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
7549 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
7550 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
7552 let Uses=[XMM0] in
7553 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
7555 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
7556 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
7557 }
7559 // Aliases with explicit %xmm0
7560 def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7561 (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
7562 def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
7563 (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
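// Illustrative note (not from the original file): sha256rnds2 takes its
// round-key input implicitly in XMM0 (UsesXMM0 = 1 above), so a typical
// round pair looks like (AT&T, register choices hypothetical):
//   movdqa      %xmm4, %xmm0
//   sha256rnds2 %xmm0, %xmm2, %xmm1
// matching the aliases above that spell out the fixed %xmm0 operand.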
7565 //===----------------------------------------------------------------------===//
7566 // AES-NI Instructions
7567 //===----------------------------------------------------------------------===//
7569 multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
7570 PatFrag ld_frag, bit Is2Addr = 1> {
7571 def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
7572 (ins VR128:$src1, VR128:$src2),
7573 !if(Is2Addr,
7574 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7575 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7576 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
7577 Sched<[WriteAESDecEnc]>;
7578 def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
7579 (ins VR128:$src1, i128mem:$src2),
7580 !if(Is2Addr,
7581 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
7582 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
7583 [(set VR128:$dst,
7584 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
7585 Sched<[WriteAESDecEncLd, ReadAfterLd]>;
7586 }
7588 // Perform One Round of an AES Encryption/Decryption Flow
7589 let Predicates = [HasAVX, HasAES] in {
7590 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
7591 int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
7592 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
7593 int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
7594 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
7595 int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
7596 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
7597 int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
7598 }
7600 let Constraints = "$src1 = $dst" in {
7601 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
7602 int_x86_aesni_aesenc, memopv2i64>;
7603 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
7604 int_x86_aesni_aesenclast, memopv2i64>;
7605 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
7606 int_x86_aesni_aesdec, memopv2i64>;
7607 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
7608 int_x86_aesni_aesdeclast, memopv2i64>;
7609 }
7611 // Perform the AES InvMixColumn Transformation
7612 let Predicates = [HasAVX, HasAES] in {
7613 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7614 (ins VR128:$src1),
7615 "vaesimc\t{$src1, $dst|$dst, $src1}",
7616 [(set VR128:$dst,
7617 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
7618 VEX;
7619 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7620 (ins i128mem:$src1),
7621 "vaesimc\t{$src1, $dst|$dst, $src1}",
7622 [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
7623 Sched<[WriteAESIMCLd]>, VEX;
7624 }
7625 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
7626 (ins VR128:$src1),
7627 "aesimc\t{$src1, $dst|$dst, $src1}",
7628 [(set VR128:$dst,
7629 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
7630 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
7631 (ins i128mem:$src1),
7632 "aesimc\t{$src1, $dst|$dst, $src1}",
7633 [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
7634 Sched<[WriteAESIMCLd]>;
7636 // AES Round Key Generation Assist
7637 let Predicates = [HasAVX, HasAES] in {
7638 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7639 (ins VR128:$src1, u8imm:$src2),
7640 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7641 [(set VR128:$dst,
7642 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7643 Sched<[WriteAESKeyGen]>, VEX;
7644 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7645 (ins i128mem:$src1, u8imm:$src2),
7646 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7647 [(set VR128:$dst,
7648 (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
7649 Sched<[WriteAESKeyGenLd]>, VEX;
7650 }
7651 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
7652 (ins VR128:$src1, u8imm:$src2),
7653 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7654 [(set VR128:$dst,
7655 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
7656 Sched<[WriteAESKeyGen]>;
7657 def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
7658 (ins i128mem:$src1, u8imm:$src2),
7659 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7660 [(set VR128:$dst,
7661 (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
7662 Sched<[WriteAESKeyGenLd]>;
7664 //===----------------------------------------------------------------------===//
7665 // PCLMUL Instructions
7666 //===----------------------------------------------------------------------===//
7668 // AVX carry-less Multiplication instructions
7669 let isCommutable = 1 in
7670 def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7671 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7672 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7673 [(set VR128:$dst,
7674 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
7675 Sched<[WriteCLMul]>;
7677 def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7678 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7679 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7680 [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
7681 (loadv2i64 addr:$src2), imm:$src3))]>,
7682 Sched<[WriteCLMulLd, ReadAfterLd]>;
7684 // Carry-less Multiplication instructions
7685 let Constraints = "$src1 = $dst" in {
7686 let isCommutable = 1 in
7687 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
7688 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
7689 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7690 [(set VR128:$dst,
7691 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
7692 IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
7694 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
7695 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
7696 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7697 [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
7698 (memopv2i64 addr:$src2), imm:$src3))],
7699 IIC_SSE_PCLMULQDQ_RM>,
7700 Sched<[WriteCLMulLd, ReadAfterLd]>;
7701 } // Constraints = "$src1 = $dst"
7704 multiclass pclmul_alias<string asm, int immop> {
7705 def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
7706 (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
7708 def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
7709 (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
7711 def : InstAlias<!strconcat("vpclmul", asm,
7712 "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
7713 (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
7714 0>;
7716 def : InstAlias<!strconcat("vpclmul", asm,
7717 "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
7718 (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
7719 0>;
7720 }
7721 defm : pclmul_alias<"hqhq", 0x11>;
7722 defm : pclmul_alias<"hqlq", 0x01>;
7723 defm : pclmul_alias<"lqhq", 0x10>;
7724 defm : pclmul_alias<"lqlq", 0x00>;
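// Illustrative note (not from the original file): in the pclmulqdq immediate,
// bit 0 selects the qword of the first source and bit 4 the qword of the
// second, so the aliases above map, e.g., pclmulhqlqdq to imm 0x01 (high
// qword of $src1 times low qword of $src2) and pclmulhqhqdq to imm 0x11
// (both high qwords).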
7726 //===----------------------------------------------------------------------===//
7727 // SSE4A Instructions
7728 //===----------------------------------------------------------------------===//
7730 let Predicates = [HasSSE4A] in {
7732 let ExeDomain = SSEPackedInt in {
7733 let Constraints = "$src = $dst" in {
7734 def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
7735 (ins VR128:$src, u8imm:$len, u8imm:$idx),
7736 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
7737 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
7738 imm:$idx))]>, PD;
7739 def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7740 (ins VR128:$src, VR128:$mask),
7741 "extrq\t{$mask, $src|$src, $mask}",
7742 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
7743 VR128:$mask))]>, PD;
7745 def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
7746 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
7747 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
7748 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
7749 imm:$len, imm:$idx))]>, XD;
7750 def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
7751 (ins VR128:$src, VR128:$mask),
7752 "insertq\t{$mask, $src|$src, $mask}",
7753 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7754 VR128:$mask))]>, XD;
7755 } // Constraints = "$src = $dst"
7756 } // ExeDomain = SSEPackedInt
7758 // Non-temporal (unaligned) scalar stores.
7759 let AddedComplexity = 400 in { // Prefer non-temporal versions
7760 let mayStore = 1, SchedRW = [WriteStore] in {
7761 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7762 "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
7764 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7765 "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
7766 }
7768 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7769 (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
7771 def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7772 (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
7774 } // AddedComplexity
7775 } // HasSSE4A
7777 //===----------------------------------------------------------------------===//
7778 // AVX Instructions
7779 //===----------------------------------------------------------------------===//
7781 //===----------------------------------------------------------------------===//
7782 // VBROADCAST - Load from memory and broadcast to all elements of the
7783 // destination operand
7785 class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7786 X86MemOperand x86memop, ValueType VT,
7787 PatFrag ld_frag, SchedWrite Sched> :
7788 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7789 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7790 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
7791 Sched<[Sched]>, VEX;
7793 // AVX2 adds register forms
7794 class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7795 ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7796 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7797 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7798 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7799 Sched<[Sched]>, VEX;
7801 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7802 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7803 f32mem, v4f32, loadf32, WriteLoad>;
7804 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7805 f32mem, v8f32, loadf32,
7806 WriteFShuffleLd>, VEX_L;
7807 }
7808 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7809 def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7810 v4f64, loadf64, WriteFShuffleLd>, VEX_L;
7812 let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7813 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7814 v4f32, v4f32, WriteFShuffle>;
7815 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7816 v8f32, v4f32, WriteFShuffle256>, VEX_L;
7817 }
7818 let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7819 def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7820 v4f64, v2f64, WriteFShuffle256>, VEX_L;
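// Illustrative example (not from the original file): a splat of a scalar
// loaded from memory, e.g. the IR sequence
//   %s = load float, float* %p
//   %v = shufflevector ...   ; all-zeros splat mask of %s
// is matched by X86VBroadcast and selects VBROADCASTSSYrm, a single
// load-and-replicate:
//   vbroadcastss (%rdi), %ymm0
// The rr forms above (AVX2 only) broadcast from a register source instead.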
7822 //===----------------------------------------------------------------------===//
7823 // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7824 // halves of a 256-bit vector.
7826 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7827 def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7828 (ins i128mem:$src),
7829 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7830 Sched<[WriteLoad]>, VEX, VEX_L;
7832 let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
7833 def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7834 (ins f128mem:$src),
7835 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7836 Sched<[WriteFShuffleLd]>, VEX, VEX_L;
7838 let Predicates = [HasAVX2, NoVLX] in {
7839 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7840 (VBROADCASTI128 addr:$src)>;
7841 def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
7842 (VBROADCASTI128 addr:$src)>;
7843 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
7844 (VBROADCASTI128 addr:$src)>;
7845 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
7846 (VBROADCASTI128 addr:$src)>;
7847 }
7849 let Predicates = [HasAVX, NoVLX] in {
7850 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7851 (VBROADCASTF128 addr:$src)>;
7852 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7853 (VBROADCASTF128 addr:$src)>;
7854 }
7856 let Predicates = [HasAVX1Only] in {
7857 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7858 (VBROADCASTF128 addr:$src)>;
7859 def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
7860 (VBROADCASTF128 addr:$src)>;
7861 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
7862 (VBROADCASTF128 addr:$src)>;
7863 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
7864 (VBROADCASTF128 addr:$src)>;
7865 }
7867 //===----------------------------------------------------------------------===//
7868 // VINSERTF128 - Insert packed floating-point values
7870 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7871 def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7872 (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7873 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7874 []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
7876 def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7877 (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7878 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7879 []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
7880 }
7882 multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7883 PatFrag memop_frag> {
7884 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7885 (iPTR imm)),
7886 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7887 (INSERT_get_vinsert128_imm VR256:$ins))>;
7888 def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7889 (From (bitconvert (memop_frag addr:$src2))),
7890 (iPTR imm)),
7891 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7892 (INSERT_get_vinsert128_imm VR256:$ins))>;
7893 }
7895 let Predicates = [HasAVX, NoVLX] in {
7896 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7897 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7898 }
7900 let Predicates = [HasAVX1Only] in {
7901 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
7902 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>;
7903 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
7904 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>;
7905 }
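// Illustrative note (not from the original file): INSERT_get_vinsert128_imm
// converts the subvector insert index into the instruction's lane immediate.
// E.g. inserting a v4f32 at element index 4 of a v8f32 becomes
//   vinsertf128 $1, %xmm1, %ymm0, %ymm0
// where imm 1 selects the upper 128-bit half of the destination.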
7907 //===----------------------------------------------------------------------===//
7908 // VEXTRACTF128 - Extract packed floating-point values
7910 let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7911 def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7912 (ins VR256:$src1, u8imm:$src2),
7913 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7914 []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
7916 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7917 (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7918 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7919 []>, Sched<[WriteStore]>, VEX, VEX_L;
7920 }
7922 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7923 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7924 (To (!cast<Instruction>(InstrStr#rr)
7925 (From VR256:$src1),
7926 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7927 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7928 (iPTR imm))), addr:$dst),
7929 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7930 (EXTRACT_get_vextract128_imm VR128:$ext))>;
7931 }
7934 let Predicates = [HasAVX, NoVLX] in {
7935 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7936 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7937 }
7939 let Predicates = [HasAVX1Only] in {
7940 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
7941 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
7942 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7943 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
7944 }
7946 //===----------------------------------------------------------------------===//
7947 // VMASKMOV - Conditional SIMD Packed Loads and Stores
7949 multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7950 Intrinsic IntLd, Intrinsic IntLd256,
7951 Intrinsic IntSt, Intrinsic IntSt256> {
7952 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7953 (ins VR128:$src1, f128mem:$src2),
7954 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7955 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7956 VEX_4V;
7957 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7958 (ins VR256:$src1, f256mem:$src2),
7959 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7960 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7961 VEX_4V, VEX_L;
7962 def mr : AVX8I<opc_mr, MRMDestMem, (outs),
7963 (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7964 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7965 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
7966 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7967 (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7968 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7969 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
7970 }
7972 let ExeDomain = SSEPackedSingle in
7973 defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7974 int_x86_avx_maskload_ps,
7975 int_x86_avx_maskload_ps_256,
7976 int_x86_avx_maskstore_ps,
7977 int_x86_avx_maskstore_ps_256>;
7978 let ExeDomain = SSEPackedDouble in
7979 defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7980 int_x86_avx_maskload_pd,
7981 int_x86_avx_maskload_pd_256,
7982 int_x86_avx_maskstore_pd,
7983 int_x86_avx_maskstore_pd_256>;
7985 //===----------------------------------------------------------------------===//
7986 // VPERMIL - Permute Single and Double Floating-Point Values
7988 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7989 RegisterClass RC, X86MemOperand x86memop_f,
7990 X86MemOperand x86memop_i, PatFrag i_frag,
7991 ValueType f_vt, ValueType i_vt> {
7992 let Predicates = [HasAVX, NoVLX] in {
7993 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7994 (ins RC:$src1, RC:$src2),
7995 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7996 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7997 Sched<[WriteFShuffle]>;
7998 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7999 (ins RC:$src1, x86memop_i:$src2),
8000 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8001 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
8002 (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
8003 Sched<[WriteFShuffleLd, ReadAfterLd]>;
8005 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
8006 (ins RC:$src1, u8imm:$src2),
8007 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8008 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
8009 Sched<[WriteFShuffle]>;
8010 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
8011 (ins x86memop_f:$src1, u8imm:$src2),
8012 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8013 [(set RC:$dst,
8014 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
8015 Sched<[WriteFShuffleLd]>;
8016 } // Predicates = [HasAVX, NoVLX]
8017 }
8019 let ExeDomain = SSEPackedSingle in {
8020 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
8021 loadv2i64, v4f32, v4i32>;
8022 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
8023 loadv4i64, v8f32, v8i32>, VEX_L;
8024 }
8025 let ExeDomain = SSEPackedDouble in {
8026 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
8027 loadv2i64, v2f64, v2i64>;
8028 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
8029 loadv4i64, v4f64, v4i64>, VEX_L;
8030 }
8032 let Predicates = [HasAVX, NoVLX] in {
8033 def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
8034 (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
8035 def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
8036 (VPERMILPSYrm VR256:$src1, addr:$src2)>;
8037 def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
8038 (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
8039 def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
8040 (VPERMILPDYrm VR256:$src1, addr:$src2)>;
8042 def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
8043 (VPERMILPSYri VR256:$src1, imm:$imm)>;
8044 def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
8045 (VPERMILPDYri VR256:$src1, imm:$imm)>;
8046 def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
8047 (i8 imm:$imm))),
8048 (VPERMILPSYmi addr:$src1, imm:$imm)>;
8049 def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
8050 (VPERMILPDYmi addr:$src1, imm:$imm)>;
8052 def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
8053 (VPERMILPSrr VR128:$src1, VR128:$src2)>;
8054 def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
8055 (VPERMILPSrm VR128:$src1, addr:$src2)>;
8056 def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
8057 (VPERMILPDrr VR128:$src1, VR128:$src2)>;
8058 def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
8059 (VPERMILPDrm VR128:$src1, addr:$src2)>;
8061 def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
8062 (VPERMILPDri VR128:$src1, imm:$imm)>;
8063 def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
8064 (VPERMILPDmi addr:$src1, imm:$imm)>;
8065 }
8067 //===----------------------------------------------------------------------===//
8068 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
8070 let ExeDomain = SSEPackedSingle in {
8071 let isCommutable = 1 in
8072 def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
8073 (ins VR256:$src1, VR256:$src2, u8imm:$src3),
8074 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8075 [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8076 (i8 imm:$src3))))]>, VEX_4V, VEX_L,
8077 Sched<[WriteFShuffle]>;
8078 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
8079 (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
8080 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8081 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
8082 (i8 imm:$src3)))]>, VEX_4V, VEX_L,
8083 Sched<[WriteFShuffleLd, ReadAfterLd]>;
8084 }
8086 let Predicates = [HasAVX] in {
8087 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8088 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8089 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
8090 (loadv4f64 addr:$src2), (i8 imm:$imm))),
8091 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8092 }
8094 let Predicates = [HasAVX1Only] in {
8095 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8096 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8097 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8098 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8099 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8100 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8101 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8102 (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8104 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
8105 (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8106 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8107 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
8108 (loadv4i64 addr:$src2), (i8 imm:$imm))),
8109 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8110 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
8111 (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8112 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8113 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8114 (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8115 (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8116 }
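// Illustrative note (not from the original file): in the vperm2f128
// immediate, bits [1:0] pick the source of the low 128-bit half of $dst
// (0/1 = halves of $src1, 2/3 = halves of $src2), bits [5:4] do the same for
// the high half, and bits 3/7 zero the corresponding half. E.g. concatenating
// the two low halves:
//   vperm2f128 $0x20, %ymm1, %ymm0, %ymm2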
8118 //===----------------------------------------------------------------------===//
8119 // VZERO - Zero YMM registers
8121 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
8122 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
8123 // Zero All YMM registers
8124 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
8125 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
8127 // Zero Upper bits of YMM registers
8128 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
8129 [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
8130 }
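// Illustrative note (not from the original file): vzeroupper clears only
// bits 255:128 of the YMM registers and is what the compiler inserts before
// calls into SSE-only code to avoid AVX-to-SSE transition penalties, e.g.
//   vzeroupper
//   callq legacy_sse_fn        (hypothetical callee)
// vzeroall additionally clears the XMM halves, so vzeroupper is preferred
// while XMM values are still live.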
8132 //===----------------------------------------------------------------------===//
8133 // Half precision conversion instructions
8134 //===----------------------------------------------------------------------===//
8135 multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8136 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
8137 "vcvtph2ps\t{$src, $dst|$dst, $src}",
8138 [(set RC:$dst, (Int VR128:$src))]>,
8139 T8PD, VEX, Sched<[WriteCvtF2F]>;
8140 let hasSideEffects = 0, mayLoad = 1 in
8141 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
8142 "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
8143 Sched<[WriteCvtF2FLd]>;
8144 }
8146 multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8147 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
8148 (ins RC:$src1, i32u8imm:$src2),
8149 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8150 [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
8151 TAPD, VEX, Sched<[WriteCvtF2F]>;
8152 let hasSideEffects = 0, mayStore = 1,
8153 SchedRW = [WriteCvtF2FLd, WriteRMW] in
8154 def mr : Ii8<0x1D, MRMDestMem, (outs),
8155 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
8156 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8157 TAPD, VEX;
8158 }
8160 let Predicates = [HasF16C] in {
8161 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
8162 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
8163 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
8164 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
8166 // Pattern match vcvtph2ps of a scalar i64 load.
8167 def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
8168 (VCVTPH2PSrm addr:$src)>;
8169 def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
8170 (VCVTPH2PSrm addr:$src)>;
8171 def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
8172 (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
8173 (VCVTPH2PSrm addr:$src)>;
8175 def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
8176 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
8177 addr:$dst),
8178 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
8179 def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
8180 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
8181 addr:$dst),
8182 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
8183 def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
8184 addr:$dst),
8185 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
8186 }
8188 // Patterns for matching conversions from float to half-float and vice versa.
8189 let Predicates = [HasF16C, NoVLX] in {
8190 // Use MXCSR.RC for rounding instead of explicitly specifying the default
8191 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
8192 // configurations we support (the default). However, falling back to MXCSR is
8193 // more consistent with other instructions, which are always controlled by it.
8194 // It's encoded as 0b100.
8195 def : Pat<(fp_to_f16 FR32:$src),
8196 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
8197 (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
8199 def : Pat<(f16_to_fp GR16:$src),
8200 (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
8201 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
8203 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
8204 (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
8205 (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32))>;
8206 }
8208 //===----------------------------------------------------------------------===//
8209 // AVX2 Instructions
8210 //===----------------------------------------------------------------------===//
8212 /// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
8213 multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
8214 ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
8215 X86MemOperand x86memop> {
8216 let isCommutable = 1 in
8217 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
8218 (ins RC:$src1, RC:$src2, u8imm:$src3),
8219 !strconcat(OpcodeStr,
8220 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8221 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
8222 Sched<[WriteBlend]>, VEX_4V;
8223 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
8224 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
8225 !strconcat(OpcodeStr,
8226 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8227 [(set RC:$dst,
8228 (OpVT (OpNode RC:$src1,
8229 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
8230 Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
8231 }
8233 defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
8234 VR128, loadv2i64, i128mem>;
8235 defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
8236 VR256, loadv4i64, i256mem>, VEX_L;
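// Illustrative note (not from the original file): unlike vpblendw (one mask
// bit per word, replicated across 128-bit lanes), vpblendd uses one immediate
// bit per dword. E.g. (AT&T):
//   vpblendd $0x3, %xmm1, %xmm0, %xmm2
// takes dwords 0-1 from %xmm1 and dwords 2-3 from %xmm0.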
8238 //===----------------------------------------------------------------------===//
8239 // VPBROADCAST - Load from memory and broadcast to all elements of the
8240 // destination operand
8242 multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
8243 X86MemOperand x86memop, PatFrag ld_frag,
8244 ValueType OpVT128, ValueType OpVT256, Predicate prd> {
8245 let Predicates = [HasAVX2, prd] in {
8246 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
8247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8248 [(set VR128:$dst,
8249 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
8250 Sched<[WriteShuffle]>, VEX;
8251 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
8252 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8253 [(set VR128:$dst,
8254 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
8255 Sched<[WriteLoad]>, VEX;
8256 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
8257 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8258 [(set VR256:$dst,
8259 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
8260 Sched<[WriteShuffle256]>, VEX, VEX_L;
8261 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
8262 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8263 [(set VR256:$dst,
8264 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
8265 Sched<[WriteLoad]>, VEX, VEX_L;
8267 // Provide aliases for broadcast from the same register class that
8268 // automatically does the extract.
8269 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
8270 (!cast<Instruction>(NAME#"Yrr")
8271 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
8272 }
8273 }
8275 defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
8276 v16i8, v32i8, NoVLX_Or_NoBWI>;
8277 defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
8278 v8i16, v16i16, NoVLX_Or_NoBWI>;
8279 defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8280 v4i32, v8i32, NoVLX>;
8281 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8282 v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold because !isTypeDesirableForOp rejects i16,
  // justifiably. This means we'll encounter truncated i32 loads; match
  // that here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
}

let Predicates = [HasAVX2] in {
  // Provide aliases for broadcast from the same register class that
  // automatically do the extract.
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;
}

let Predicates = [HasAVX2, NoVLX] in {
  // Provide a fallback in case the load node used in the patterns above
  // has additional users, which prevents pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBrr (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR8:$src, sub_8bit)),
                         VR128))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBYrr (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)),
                          VR128))>;
  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWrr (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR16:$src, sub_16bit)),
                         VR128))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWYrr (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR16:$src, sub_16bit)),
                          VR128))>;
}
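
// The INSERT_SUBREG dance above first widens the GR8/GR16 value to i32:
// there is no direct byte/word GPR-to-XMM move, so the value is inserted
// into an undef 32-bit register and transferred with a 32-bit move before
// being broadcast.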

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;

  // The patterns for VPBROADCASTD are not needed because they would match
  // the exact same thing as the VBROADCASTSS patterns.
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
  // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
            (VBROADCASTSSYrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
            (VBROADCASTSDYrm addr:$src)>;
  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
            (VBROADCASTSSrm addr:$src)>;
}

// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
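
  // Note: the 0x44 immediate (0b01000100) selects dwords {0,1,0,1}, i.e. it
  // splats the low qword, giving a 64-bit broadcast within the XMM register.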

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT, X86FoldableSchedWrite Sched> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, i256mem:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (bitconvert (mem_frag addr:$src2)))))]>,
                     Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
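
// For VPERMD/VPERMPS, $src1 supplies one index per destination element (only
// the low 3 bits of each dword are used) and $src2 (register or memory)
// supplies the data; unlike the 128-bit shuffles, elements may cross lanes.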

multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 imm:$src2))))]>,
                       Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256>, VEX_W;
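
// For VPERMQ/VPERMPD, bits [2i+1:2i] of the immediate select the source
// qword for destination element i; e.g. an immediate of 0x1B (0b00011011)
// reverses the four 64-bit elements.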

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                            (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
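
// The immediate picks each 128-bit half of the result: bits [1:0] select the
// source of the low half (0/1 = low/high half of $src1, 2/3 = low/high half
// of $src2), bits [5:4] do the same for the high half, and bits 3 and 7 zero
// the corresponding half instead.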

let Predicates = [HasAVX2] in {
  def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

  def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                    (i8 imm:$imm))),
            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                     (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                    (i8 imm:$imm))),
            (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteStore]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
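
// The sign bit of each mask element controls the access: set bits load/store
// the corresponding element, clear bits leave memory untouched on stores and
// zero the destination element on loads. Masked-off elements never fault.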

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
  // masked store
  def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
                (VT (bitconvert (ZeroVT immAllZerosV))))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
           (!cast<Instruction>(BlendStr#"rr")
              RC:$src0,
              (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
              RC:$mask)>;
}
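
// The last pattern needs the blend because the hardware load zeroes
// masked-off elements; a nontrivial pass-through value ($src0) must be
// merged back in with a variable blend keyed on the same mask.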

let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}

let Predicates = [HasAVX1Only] in {
  // i32/i64 masked load/store are not supported; use the ps/pd versions.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}

let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}

//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents pattern selection.

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
            (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v2i64 VR128:$src), 1)>;
  def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
            (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v4i32 VR128:$src), 1)>;
  def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
            (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v8i16 VR128:$src), 1)>;
  def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
            (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v16i8 VR128:$src), 1)>;
}
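
// Broadcasting a 128-bit subvector to 256 bits is just "insert the same
// register into both halves": INSERT_SUBREG places it in the low half and
// the VINSERT*128 with immediate 1 duplicates it into the high half.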

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v2f64 VR128:$src), 1)>;
  def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v4f32 VR128:$src), 1)>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v2i64 VR128:$src), 1)>;
  def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v4i32 VR128:$src), 1)>;
  def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v8i16 VR128:$src), 1)>;
  def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
            (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                           (v16i8 VR128:$src), 1)>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
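
  // Each element is shifted by the count in the corresponding element of the
  // second source. Counts at or above the element width produce 0 for
  // vpsllv/vpsrlv and all sign bits for vpsravd.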
  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86vsrav VR128:$src1,
                    (bitconvert (loadv2i64 addr:$src2)))),
            (VPSRAVDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86vsrav VR256:$src1,
                    (bitconvert (loadv4i64 addr:$src2)))),
            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, VEX_L;
}

let mayLoad = 1, hasSideEffects = 0, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
  }
}
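
// Gathers load each unmasked element from base + index*scale and clear the
// corresponding mask element as they complete; $mask_wb models that
// writeback, and the @earlyclobber constraints keep the destination and the
// written-back mask from being allocated on top of the index or mask inputs.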

//===----------------------------------------------------------------------===//
// Extra selection patterns for FR128, f128, f128mem

// movaps is one byte shorter than movdqa (no 0x66 prefix); movaps is in SSE
// while movdqa requires SSE2.
def : Pat<(store (f128 FR128:$src), addr:$dst),
          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;

def : Pat<(loadf128 addr:$src),
          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;

// andps is one byte shorter than andpd or pand (no 0x66 prefix); andps is in
// SSE while andpd/pand require SSE2.
def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86fand FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(and FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86for FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(or FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
          (COPY_TO_REGCLASS
            (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
            FR128)>;

def : Pat<(X86fxor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

def : Pat<(xor FR128:$src1, FR128:$src2),
          (COPY_TO_REGCLASS
            (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;